memblock

前两天，看系统初始化时解析dtb的时候，发现首先会查找memory类型的节点并由memblock管理。

那么，现在就来分析一下memblock吧！

继续由dtb相关的接口开始分析，parse_dtb会查找到memory类型节点，确定memory节点对应的base以及size，最终会调用memblock_add函数。

memblock_add：

/**
 * memblock_add - add a physical memory range to memblock.memory
 * @base: start address of the range
 * @size: length of the range in bytes
 *
 * Emits a debug message showing the range (with an inclusive end address)
 * and the caller, then passes the range to memblock_add_range() with node
 * MAX_NUMNODES and no flags.
 *
 * Return: the value returned by memblock_add_range().
 */
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
	/* last byte of the range; only needed for the debug message */
	phys_addr_t last = base + size - 1;

	memblock_dbg("%s: [%pa-%pa] %pS\n",
		     __func__, &base, &last, (void *)_RET_IP_);

	return memblock_add_range(&memblock.memory, base, size,
				  MAX_NUMNODES, 0);
}

由参数可知，memblock是一个全局变量（是不是memblock最顶层由一个数据结构进行管理？）。

/*
 * Top-level memblock state. It owns two region collections, "memory" and
 * "reserved", each described by a struct memblock_type.
 */
struct memblock {
	bool bottom_up;  /* is bottom up direction? */
	/* upper address bound passed to the free-range search */
	phys_addr_t current_limit;
	/* collection of physical memory ranges */
	struct memblock_type memory;
	/* collection of ranges that have been reserved */
	struct memblock_type reserved;
};

/* the single global instance, defined in mm/memblock.c */
extern struct memblock memblock;

struct memblock中的memory以及reserved成员都是struct memblock_type数据结构。

/*
 * One collection of regions (used for both memblock.memory and
 * memblock.reserved).
 */
struct memblock_type {
	/* number of entries of @regions currently counted as present */
	unsigned long cnt;
	/* number of entries @regions has room for */
	unsigned long max;
	/* NOTE(review): presumably the summed size of all regions -- confirm */
	phys_addr_t total_size;
	/* array of @max region descriptors */
	struct memblock_region *regions;
	/* name used in log messages, e.g. "memory" or "reserved" */
	char *name;
};

不难看出，memblock_type会管理一个个不同的block（或者称为region），使用struct memblock_region的数组进行管理。cnt为数组中当前已有的region个数（数组为空时cnt也为1，对应一个size为0的占位项），max为数组已分配的容量，即最多可容纳的region个数，因此cnt ≤ max。

struct memblock_region为一个单独的memory block。

/*
 * A single contiguous range [base, base + size) tracked by memblock.
 */
struct memblock_region {
	/* start address of the range */
	phys_addr_t base;
	/* length of the range; 0 marks the empty placeholder entry */
	phys_addr_t size;
	/* attribute flags of the range */
	enum memblock_flags flags;
#ifdef CONFIG_NUMA
	/* node id; accessed via memblock_{set,get}_region_node() */
	int nid;
#endif
};

从数据结构的关系上，不难看出，struct memblock是最顶层的数据结构，它有两个成员memory和reserved，分别记录系统中的物理内存区间以及其中已被保留（已分配出去、不能再被分配）的区间，二者都由数据结构struct memblock_type表示。struct memblock_type中有struct memblock_region的数组，用于表示具体的一个个block。

（还有一个问题，既然memblock是一个全局变量，那么它的值是在哪里进行初始化的呢？毕竟我们看到的时候就直接访问它的memory以及memory后续的region成员了！）

在mm/memblock.c中：

/*
 * Statically allocated region arrays that memblock starts out with.
 * memblock_double_array() replaces them with a larger array once they
 * are too small.
 */
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;

/*
 * The global memblock instance. Both collections initially point at the
 * static arrays above and contain one empty placeholder entry (cnt == 1,
 * size == 0).
 */
struct memblock memblock __initdata_memblock = {
	.memory.regions		= memblock_memory_init_regions,
	.memory.cnt		= 1,	/* empty dummy entry */
	.memory.max		= INIT_MEMBLOCK_MEMORY_REGIONS,
	.memory.name		= "memory",

	.reserved.regions	= memblock_reserved_init_regions,
	.reserved.cnt		= 1,	/* empty dummy entry */
	.reserved.max		= INIT_MEMBLOCK_RESERVED_REGIONS,
	.reserved.name		= "reserved",

	/* default allocation direction is not bottom-up */
	.bottom_up		= false,
	.current_limit		= MEMBLOCK_ALLOC_ANYWHERE,
};

可以看到是静态分配了各种空间。

现在继续回到memblock_add_range函数。

（蛮长的，慢慢看）

/**
 * memblock_add_range - add a range to a memblock region collection
 * @type: collection to add the range to
 * @base: start address of the new range
 * @size: length of the new range
 * @nid: node id stored on every region that gets inserted
 * @flags: flags stored on every region that gets inserted
 *
 * Adds [@base, @base + @size) to @type. Where the new range overlaps
 * existing regions, only the parts lying in front of or behind those
 * regions are inserted. The scan runs up to two times: a counting pass
 * that determines how many regions are needed (growing the array if
 * necessary), followed by the pass that actually inserts them.
 *
 * Return: 0 on success (including when nothing had to be added),
 * -ENOMEM if the region array could not be enlarged.
 */
static int __init_memblock memblock_add_range(struct memblock_type *type,
				phys_addr_t base, phys_addr_t size,
				int nid, enum memblock_flags flags)
{
	bool insert = false;
	phys_addr_t obase = base;
	// NOTE(review): memblock_cap_size() presumably trims @size so that
	// base + size cannot exceed the maximum address -- confirm
	phys_addr_t end = base + memblock_cap_size(base, &size);
	int idx, nr_new, start_rgn = -1, end_rgn;
	struct memblock_region *rgn;

	if (!size)
		return 0;

	/* special case for empty array */

	// The array only holds the empty placeholder entry:
	// fill slot 0 in place and return.
	if (type->regions[0].size == 0) {
		WARN_ON(type->cnt != 1 || type->total_size);
		type->regions[0].base = base;
		type->regions[0].size = size;
		type->regions[0].flags = flags;
		memblock_set_region_node(&type->regions[0], nid);
		type->total_size = size;
		return 0;
	}

	/*
	 * The worst case is when new range overlaps all existing regions,
	 * then we'll need type->cnt + 1 empty regions in @type. So if
	 * type->cnt * 2 + 1 is less than or equal to type->max, we know
	 * that there is enough empty regions in @type, and we can insert
	 * regions directly.
	 */
	// Enough room even for the worst case: skip the counting pass.
	// Otherwise the first pass only counts and the array may have to grow.
	if (type->cnt * 2 + 1 <= type->max)
		insert = true;

repeat:
	/*
	 * The following is executed twice.  Once with %false @insert and
	 * then with %true.  The first counts the number of regions needed
	 * to accommodate the new area.  The second actually inserts them.
	 */
	base = obase;
	nr_new = 0;

	for_each_memblock_type(idx, type, rgn) {
		// Walk every memblock_region of @type;
		// [rbase, rend) is the extent of the region being visited.
		phys_addr_t rbase = rgn->base;
		phys_addr_t rend = rbase + rgn->size;

		// @rgn starts at or after the end of the new range: stop.
		// (The early break presumably relies on the regions being
		// kept in ascending address order -- confirm.)
		if (rbase >= end)
			break;
		// @rgn ends at or before the remaining new range: no overlap
		// with this one, look at the next region.
		if (rend <= base)
			continue;
		/*
		 * @rgn overlaps.  If it separates the lower part of new
		 * area, insert that portion.
		 */
		// [base, rbase) of the new range lies in front of @rgn,
		// so that portion needs a region of its own.
		if (rbase > base) {
#ifdef CONFIG_NUMA
			WARN_ON(nid != memblock_get_region_node(rgn));
#endif
			WARN_ON(flags != rgn->flags);
			nr_new++;

			// Counting pass: only nr_new is bumped.
			// Insertion pass: the region is actually added.
			if (insert) {
				if (start_rgn == -1)
					start_rgn = idx;
				end_rgn = idx + 1;
				memblock_insert_region(type, idx++, base,
						       rbase - base, nid,
						       flags);
			}
		}
		/* area below @rend is dealt with, forget about it */
		// Inserted or not, continue from the smaller of rend and end.
		base = min(rend, end);
	}

	/* insert the remaining portion */
	// base < end: part of the new range is still unhandled after the
	// scan, so it needs one more region.
	if (base < end) {
		nr_new++;
		if (insert) {
			if (start_rgn == -1)
				start_rgn = idx;
			end_rgn = idx + 1;
			memblock_insert_region(type, idx, base, end - base,
					       nid, flags);
		}
	}

	if (!nr_new)
		return 0;

	/*
	 * If this was the first round, resize array and repeat for actual
	 * insertions; otherwise, merge and return.
	 */
	if (!insert) {
		// Grow the array until the counted number of new regions fits.
		while (type->cnt + nr_new > type->max)
			if (memblock_double_array(type, obase, size) < 0)
				return -ENOMEM;
		// Switch to the insertion pass.
		insert = true;
		goto repeat;
	} else {
		memblock_merge_regions(type, start_rgn, end_rgn);
		return 0;
	}
}

这里的代码和dtb创建device tree的代码很类似，第一次确定device tree需要分配的内存大小，第二次填充数据；而这里是如果第一次空间不是很足够，则第一次确定要增加多少，之后确定要不要增大空间，第二次再插入具体的region。

（存放memblock的内存是谁管的呢？？？）

下面先看一下memblock_double_array函数（这里负责延伸整个array空间），也是蛮长的：

/**
 * memblock_double_array - double the capacity of a region array
 * @type: collection whose region array is to be enlarged
 * @new_area_start: start of the range currently being added to @type
 * @new_area_size: size of the range currently being added to @type
 *
 * Allocates an array twice as large (from slab if available, otherwise
 * from memblock itself), copies the existing entries over, zeroes the new
 * half and releases the old array unless it is one of the static initial
 * arrays. When doubling memblock.reserved without slab, the search for
 * space avoids the range described by @new_area_start/@new_area_size.
 *
 * Return: 0 on success, -1 if resizing is not allowed yet or no space
 * could be found.
 */
static int __init_memblock memblock_double_array(struct memblock_type *type,
						phys_addr_t new_area_start,
						phys_addr_t new_area_size)
{
	struct memblock_region *new_array, *old_array;
	phys_addr_t old_alloc_size, new_alloc_size;
	phys_addr_t old_size, new_size, addr, new_end;
	// Whether the slab allocator can be used at this point.
	// NOTE(review): the author observed this to still be false while
	// parse_dtb() runs -- confirm for the platform in question.
	int use_slab = slab_is_available();
	int *in_slab;

	/* We don't allow resizing until we know about the reserved regions
	 * of memory that aren't suitable for allocation
	 */
	if (!memblock_can_resize)
		return -1;

	/* Calculate new doubled size */
	// The new array is exactly twice as large as the old one.
	old_size = type->max * sizeof(struct memblock_region);
	new_size = old_size << 1;
	/*
	 * We need to allocated new one align to PAGE_SIZE,
	 *   so we can free them completely later.
	 */
	// Round both sizes up to whole pages.
	old_alloc_size = PAGE_ALIGN(old_size);
	new_alloc_size = PAGE_ALIGN(new_size);

	/* Retrieve the slab flag */
	if (type == &memblock.memory)
		in_slab = &memblock_memory_in_slab;
	else
		in_slab = &memblock_reserved_in_slab;

	/* Try to find some space for it */
	// Slab is available: take the new array from kmalloc().
	if (use_slab) {
		new_array = kmalloc(new_size, GFP_KERNEL);
		addr = new_array ? __pa(new_array) : 0;
	} else {
		/* only exclude range when trying to double reserved.regions */
		if (type != &memblock.reserved)
			new_area_start = new_area_size = 0;

		// Non-reserved type: search [0, memblock.current_limit).
		// Reserved type: first search above the range being added,
		// i.e. [new_area_start + new_area_size, memblock.current_limit).
		addr = memblock_find_in_range(new_area_start + new_area_size,
						memblock.current_limit,
						new_alloc_size, PAGE_SIZE);
		// Nothing found above the range being added (reserved type
		// only): retry below it.
		if (!addr && new_area_size)
			addr = memblock_find_in_range(0,
				min(new_area_start, memblock.current_limit),
				new_alloc_size, PAGE_SIZE);

		new_array = addr ? __va(addr) : NULL;
	}
	if (!addr) {
		pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
		       type->name, type->max, type->max * 2);
		return -1;
	}

	new_end = addr + new_size - 1;
	memblock_dbg("memblock: %s is doubled to %ld at [%pa-%pa]",
			type->name, type->max * 2, &addr, &new_end);

	/*
	 * Found space, we now need to move the array over before we add the
	 * reserved region since it may be our reserved array itself that is
	 * full.
	 */
	memcpy(new_array, type->regions, old_size);
	memset(new_array + type->max, 0, old_size);
	old_array = type->regions;
	type->regions = new_array;
	type->max <<= 1;

	/* Free old array. We needn't free it if the array is the static one */
	if (*in_slab)
		kfree(old_array);
	else if (old_array != memblock_memory_init_regions &&
		 old_array != memblock_reserved_init_regions)
		// NOTE(review): memblock_free() presumably works by removing
		// the range from memblock.reserved -- confirm
		memblock_free(old_array, old_alloc_size);

	/*
	 * Reserve the new array if that comes from the memblock.  Otherwise, we
	 * needn't do it
	 */
	// Reserve the new array's pages so memblock does not hand them out again.
	if (!use_slab)
		BUG_ON(memblock_reserve(addr, new_alloc_size));

	/* Update slab flag */
	*in_slab = use_slab;

	return 0;
}

在mm/memblock.c中（memblock_memory_in_slab以及memblock_reserved_in_slab定义如下）：

（也就是说，这两个值的初值为0）

/*
 * Set by memblock_double_array(): non-zero when the current regions array
 * of memblock.memory / memblock.reserved was allocated from slab, so that
 * the next resize releases it with kfree(). Both start out as 0.
 */
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock  = 0;

看上面的代码，核心的分配函数应该为memblock_find_in_range：

/**
 * memblock_find_in_range - find a free area within an address range
 * @start: lower bound of the search range
 * @end: upper bound of the search range
 * @size: size of the area to find
 * @align: required alignment of the area
 *
 * Searches on any node (NUMA_NO_NODE) using the flags chosen by
 * choose_memblock_flags(). If nothing is found while MEMBLOCK_MIRROR is
 * set, a warning is printed and the search is repeated without that flag.
 *
 * Return: address of the area found, or 0 if there is none.
 */
static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
					phys_addr_t end, phys_addr_t size,
					phys_addr_t align)
{
	enum memblock_flags flags = choose_memblock_flags();
	phys_addr_t found;

	for (;;) {
		found = memblock_find_in_range_node(size, align, start, end,
						    NUMA_NO_NODE, flags);

		/* done on success, or when there is no flag left to drop */
		if (found || !(flags & MEMBLOCK_MIRROR))
			return found;

		pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n",
			&size);
		/* retry without the mirrored-memory requirement */
		flags &= ~MEMBLOCK_MIRROR;
	}
}

继续调用，一直调用到__memblock_find_range_bottom_up或者__memblock_find_range_top_down（依据memblock的bottom_up成员，该成员决定分配内存是自顶向下还是自底向上）。

/**
 * __memblock_find_range_bottom_up - find a free area, lowest address first
 * @start: lower bound of the search range
 * @end: upper bound of the search range
 * @size: size of the area to find
 * @align: required alignment of the area
 * @nid: node id passed to the free-range iterator
 * @flags: flags passed to the free-range iterator
 *
 * Visits the free ranges in iteration order, clamps each one to
 * [@start, @end] and returns the first aligned address that leaves at
 * least @size bytes before the end of the clamped range.
 *
 * Return: the address found, or 0 if no range is large enough.
 */
static phys_addr_t __init_memblock
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
				phys_addr_t size, phys_addr_t align, int nid,
				enum memblock_flags flags)
{
	phys_addr_t range_start, range_end;
	u64 cursor;

	for_each_free_mem_range(cursor, nid, flags, &range_start, &range_end,
				NULL) {
		phys_addr_t candidate;

		/* restrict the free range to the window being searched */
		range_start = clamp(range_start, start, end);
		range_end = clamp(range_end, start, end);

		candidate = round_up(range_start, align);

		/* alignment pushed the candidate out of the range */
		if (candidate >= range_end)
			continue;
		/* not enough room left behind the candidate */
		if (range_end - candidate < size)
			continue;

		return candidate;
	}

	return 0;
}

遍历range，这里使用的函数是for_each_free_mem_range：

/*
 * for_each_free_mem_range - iterate over the free memory ranges
 *
 * Walks the ranges that are in memblock.memory but not in
 * memblock.reserved. @i is the u64 loop cursor; @p_start, @p_end and
 * @p_nid receive the start, end and node id of each range.
 */
#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid)	\
	__for_each_mem_range(i, &memblock.memory, &memblock.reserved,	\
			     nid, flags, p_start, p_end, p_nid)

/*
 * __for_each_mem_range - iterate over the ranges in @type_a that are not
 * covered by @type_b
 *
 * Every step, including the first, is performed by __next_mem_range(),
 * which sets @i to ULLONG_MAX once the iteration is finished.
 */
#define __for_each_mem_range(i, type_a, type_b, nid, flags,		\
			   p_start, p_end, p_nid)			\
	for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b,	\
				     p_start, p_end, p_nid);		\
	     i != (u64)ULLONG_MAX;					\
	     __next_mem_range(&i, nid, flags, type_a, type_b,		\
			      p_start, p_end, p_nid))

for_each_free_mem_range这个宏用于遍历空闲区间：即memory的regions数组所描述的内存中，不属于reserved的regions数组的那些部分。每一步迭代实际由__next_mem_range函数完成：

/**
 * __next_mem_range - advance a memory range iteration by one step
 * @idx: iteration cursor; the low 32 bits hold the index into @type_a,
 *       the high 32 bits the index into @type_b
 * @nid: node id used for filtering regions
 * @flags: flags used for filtering regions
 * @type_a: collection whose ranges are reported
 * @type_b: collection whose ranges are excluded; may be NULL
 * @out_start: where to store the start of the range found, may be NULL
 * @out_end: where to store the end of the range found, may be NULL
 * @out_nid: where to store the node id of the range found, may be NULL
 *
 * Without @type_b, each region of @type_a that is not skipped is reported
 * as is. With @type_b, the parts of the @type_a regions that fall into the
 * gaps between the @type_b regions are reported.
 *
 * *@idx is updated for the next call, or set to ULLONG_MAX when there is
 * nothing left to report.
 */
void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
		      struct memblock_type *type_a,
		      struct memblock_type *type_b, phys_addr_t *out_start,
		      phys_addr_t *out_end, int *out_nid)
{
	/* unpack the two indices from the 64-bit cursor */
	int idx_a = *idx & 0xffffffff;
	int idx_b = *idx >> 32;

	if (WARN_ONCE(nid == MAX_NUMNODES,
	"Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
		nid = NUMA_NO_NODE;

	for (; idx_a < type_a->cnt; idx_a++) {
		struct memblock_region *m = &type_a->regions[idx_a];

		phys_addr_t m_start = m->base;
		phys_addr_t m_end = m->base + m->size;
		int	    m_nid = memblock_get_region_node(m);

		if (should_skip_region(type_a, m, nid, flags))
			continue;

		/* nothing to exclude: report the whole region */
		if (!type_b) {
			if (out_start)
				*out_start = m_start;
			if (out_end)
				*out_end = m_end;
			if (out_nid)
				*out_nid = m_nid;
			idx_a++;
			*idx = (u32)idx_a | (u64)idx_b << 32;
			return;
		}

		/* scan areas before each reservation */
		for (; idx_b < type_b->cnt + 1; idx_b++) {
			struct memblock_region *r;
			phys_addr_t r_start;
			phys_addr_t r_end;

			/*
			 * [r_start, r_end) is the gap in front of type_b
			 * region idx_b: it begins where the previous type_b
			 * region ends (or at 0) and stops at the base of
			 * region idx_b (or at PHYS_ADDR_MAX behind the last
			 * one).
			 */
			r = &type_b->regions[idx_b];
			r_start = idx_b ? r[-1].base + r[-1].size : 0;
			r_end = idx_b < type_b->cnt ?
				r->base : PHYS_ADDR_MAX;

			/*
			 * if idx_b advanced past idx_a,
			 * break out to advance idx_a
			 */
			if (r_start >= m_end)
				break;
			/* if the two regions intersect, we're done */
			if (m_start < r_end) {
				if (out_start)
					*out_start =
						max(m_start, r_start);
				if (out_end)
					*out_end = min(m_end, r_end);
				if (out_nid)
					*out_nid = m_nid;
				/*
				 * The region which ends first is
				 * advanced for the next iteration.
				 */
				if (m_end <= r_end)
					idx_a++;
				else
					idx_b++;
				*idx = (u32)idx_a | (u64)idx_b << 32;
				return;
			}
		}
	}

	/* signal end of iteration */
	*idx = ULLONG_MAX;
}

也就是说，现在实现了memblock数据的自举（刚开始的时候是一个静态数组，后来如果静态数组大小不够了，那么就需要从regions代表的内存中申请内存来存放memblock的数组）。interesting！！！

现在看一下memblock_alloc的代码：

-memblock_alloc
	-memblock_alloc_try_nid
		-memblock_alloc_internal
			-memblock_alloc_range_nid

/**
 * memblock_alloc_range_nid - allocate a physical memory area
 * @size: size of the area to allocate
 * @align: alignment of the area; 0 is replaced by SMP_CACHE_BYTES
 * @start: lower bound of the address range to search
 * @end: upper bound of the address range to search; the special value
 *       MEMBLOCK_ALLOC_NOLEAKTRACE suppresses the kmemleak registration
 * @nid: node to search first
 * @exact_nid: if true, do not fall back to other nodes
 *
 * Finds a free area and reserves it. On failure the search is retried,
 * first on any node (unless @exact_nid is set or @nid already is
 * NUMA_NO_NODE), then once more from the start without the
 * MEMBLOCK_MIRROR flag if that flag was set.
 *
 * Return: address of the allocated area, or 0 on failure.
 */
phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
					phys_addr_t align, phys_addr_t start,
					phys_addr_t end, int nid,
					bool exact_nid)
{
	enum memblock_flags flags = choose_memblock_flags();
	phys_addr_t found;

	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
		nid = NUMA_NO_NODE;

	if (!align) {
		/* Can't use WARNs this early in boot on powerpc */
		dump_stack();
		align = SMP_CACHE_BYTES;
	}

again:
	/* first attempt: the requested node */
	found = memblock_find_in_range_node(size, align, start, end, nid,
					    flags);
	if (found && !memblock_reserve(found, size))
		goto done;

	/* second attempt: any node, unless the caller insists on @nid */
	if (nid != NUMA_NO_NODE && !exact_nid) {
		found = memblock_find_in_range_node(size, align, start,
						    end, NUMA_NO_NODE,
						    flags);
		if (found && !memblock_reserve(found, size))
			goto done;
	}

	/* last resort: drop the mirrored-memory requirement and start over */
	if (flags & MEMBLOCK_MIRROR) {
		flags &= ~MEMBLOCK_MIRROR;
		pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n",
			&size);
		goto again;
	}

	return 0;

done:
	/*
	 * Skip kmemleak for those places like kasan_init() and
	 * early_pgtable_alloc() due to high volume.
	 */
	if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
		/*
		 * Memblock allocated blocks are never reported as
		 * leaks. This is because many of these blocks are
		 * only referred via the physical address which is
		 * not looked up by kmemleak.
		 */
		kmemleak_alloc_phys(found, size, 0);

	/*
	 * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP,
	 * require memory to be accepted before it can be used by the
	 * guest.
	 *
	 * Accept the memory of the allocated buffer.
	 */
	accept_memory(found, found + size);

	return found;
}

这里核心是调用memblock_find_in_range_node函数查找一段空闲的range，然后调用memblock_reserve函数将刚才找到的memory region加入reserved中。

现在看完了memblock的组织以及申请内存的逻辑，下次分析memblock→伙伴系统。

memblock

memblock section系统crash

memblock linux

526互联

memblock

memblock