memblock
前两天,看系统初始化时解析dtb的时候,发现首先会查找memory类型的节点并由memblock管理。
那么,现在就来分析一下memblock吧!
继续由dtb相关的接口开始分析,parse_dtb会查找到memory类型节点,确定memory节点对应的base以及size,最终会调用memblock_add函数。
memblock_add:
/*
 * memblock_add - register a new memory range with memblock.memory.
 * @base: physical base address of the range
 * @size: size of the range in bytes
 *
 * Thin wrapper that logs the request and hands it to
 * memblock_add_range() against the global memblock.memory type.
 * Returns 0 on success, -errno on failure.
 */
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
	phys_addr_t last = base + size - 1;

	memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
		     &base, &last, (void *)_RET_IP_);

	return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
}
由函数体中对memblock.memory的引用可知,memblock是一个全局变量(是不是memblock最顶层由一个数据结构进行管理?)。
/*
 * struct memblock - top-level descriptor of the early boot memory allocator.
 * Holds two region collections: all known memory and the reserved portions.
 */
struct memblock {
	bool bottom_up;			/* is bottom up direction? */
	phys_addr_t current_limit;	/* upper address bound for allocations */
	struct memblock_type memory;	/* usable memory regions */
	struct memblock_type reserved;	/* regions already handed out / reserved */
};
extern struct memblock memblock;
struct memblock中的memory以及reserved成员都是struct memblock_type数据结构。
/*
 * struct memblock_type - a collection of memory regions of one kind
 * (e.g. "memory" or "reserved"), kept in a dynamically growable array.
 */
struct memblock_type {
	unsigned long cnt;		/* number of regions currently in use */
	unsigned long max;		/* allocated capacity of the regions array */
	phys_addr_t total_size;		/* total size of all regions */
	struct memblock_region *regions;	/* array of regions, sorted by base */
	char *name;			/* type name used in log messages */
};
不难看出,memblock_type管理着一个个不同的block(或者称为region),使用struct memblock_region的数组进行管理。cnt为当前已使用的region个数,max为regions数组已分配的容量(cnt ≤ max)。
struct memblock_region为一个单独的memory block。
/*
 * struct memblock_region - a single contiguous physical memory block.
 */
struct memblock_region {
	phys_addr_t base;	/* physical start address */
	phys_addr_t size;	/* size in bytes */
	enum memblock_flags flags;	/* region attributes (mirror, hotplug, ...) */
#ifdef CONFIG_NUMA
	int nid;		/* NUMA node this region belongs to */
#endif
};
从数据结构的关系上,不难看出,struct memblock是最顶层的数据结构,它有两个成员memory和reserved,分别代表正常的内存以及保留的内存,由数据结构struct memblock_type表示。struct memblock_type中有struct memblock_region的数组,用于表示具体的一个个block。
(还有一个问题,既然memblock是一个全局变量,那么它的值是在哪里进行初始化的呢?毕竟我们看到的时候就直接访问它的memory以及memory后续的region成员了!)
在/mm/memblock.c中:
/* static boot-time storage backing memblock.memory's regions array */
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock;
/* static boot-time storage backing memblock.reserved's regions array */
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
/*
 * The single global memblock instance.  It is statically initialized so
 * it can be used before any dynamic allocator exists: both regions
 * pointers start out aimed at fixed-size static arrays, which may later
 * be replaced by memblock_double_array() once resizing is allowed.
 */
struct memblock memblock __initdata_memblock = {
	.memory.regions = memblock_memory_init_regions,
	.memory.cnt = 1, /* empty dummy entry */
	.memory.max = INIT_MEMBLOCK_MEMORY_REGIONS,
	.memory.name = "memory",

	.reserved.regions = memblock_reserved_init_regions,
	.reserved.cnt = 1, /* empty dummy entry */
	.reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS,
	.reserved.name = "reserved",

	.bottom_up = false,
	.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
可以看到是静态分配了各种空间。
现在继续回到memblock_add_range函数。
(?蛮长的,慢慢看)
/*
 * memblock_add_range - add a new region to a memblock type.
 * @type:  memblock type to add the new region into
 * @base:  base address of the new region
 * @size:  size of the new region
 * @nid:   NUMA node id of the new region
 * @flags: flags of the new region
 *
 * Adds [@base, @base + @size) to @type.  Regions are kept sorted by base
 * address and non-overlapping: parts of the new range already covered by
 * existing regions are skipped, and only the uncovered pieces are
 * inserted.  The scan may run twice — a counting pass first, then the
 * actual insertion pass once the array is known to be big enough.
 * Returns 0 on success, -ENOMEM if the regions array cannot be grown.
 */
static int __init_memblock memblock_add_range(struct memblock_type *type,
				phys_addr_t base, phys_addr_t size,
				int nid, enum memblock_flags flags)
{
	bool insert = false;
	phys_addr_t obase = base;
	/* clamp @size so that @base + @size cannot wrap past the address limit */
	phys_addr_t end = base + memblock_cap_size(base, &size);
	int idx, nr_new, start_rgn = -1, end_rgn;
	struct memblock_region *rgn;

	if (!size)
		return 0;

	/* special case for empty array */
	/* regions array still empty: just fill in the first slot and return */
	if (type->regions[0].size == 0) {
		WARN_ON(type->cnt != 1 || type->total_size);
		type->regions[0].base = base;
		type->regions[0].size = size;
		type->regions[0].flags = flags;
		memblock_set_region_node(&type->regions[0], nid);
		type->total_size = size;
		return 0;
	}

	/*
	 * The worst case is when new range overlaps all existing regions,
	 * then we'll need type->cnt + 1 empty regions in @type. So if
	 * type->cnt * 2 + 1 is less than or equal to type->max, we know
	 * that there is enough empty regions in @type, and we can insert
	 * regions directly.
	 */
	/*
	 * If there is guaranteed room even in the worst case, insert on
	 * the first pass; otherwise the array may need to be grown first.
	 */
	if (type->cnt * 2 + 1 <= type->max)
		insert = true;

repeat:
	/*
	 * The following is executed twice. Once with %false @insert and
	 * then with %true. The first counts the number of regions needed
	 * to accommodate the new area. The second actually inserts them.
	 */
	base = obase;
	nr_new = 0;

	for_each_memblock_type(idx, type, rgn) {
		/*
		 * Walk every existing region; the array is kept sorted by
		 * base address, rbase/rend are the current region's bounds.
		 */
		phys_addr_t rbase = rgn->base;
		phys_addr_t rend = rbase + rgn->size;

		/* region starts at or after the new range: no more overlap */
		if (rbase >= end)
			break;
		/* region ends at or before the new range: keep scanning */
		if (rend <= base)
			continue;
		/*
		 * @rgn overlaps. If it separates the lower part of new
		 * area, insert that portion.
		 */
		/* the part of the new range below @rbase is not covered yet */
		if (rbase > base) {
#ifdef CONFIG_NUMA
			WARN_ON(nid != memblock_get_region_node(rgn));
#endif
			WARN_ON(flags != rgn->flags);
			nr_new++;
			/* only insert on the pass where room is guaranteed */
			if (insert) {
				if (start_rgn == -1)
					start_rgn = idx;
				end_rgn = idx + 1;
				memblock_insert_region(type, idx++, base,
						       rbase - base, nid,
						       flags);
			}
		}
		/* area below @rend is dealt with, forget about it */
		base = min(rend, end);
	}

	/* insert the remaining portion */
	/* whatever is left above the last overlapping region */
	if (base < end) {
		nr_new++;
		if (insert) {
			if (start_rgn == -1)
				start_rgn = idx;
			end_rgn = idx + 1;
			memblock_insert_region(type, idx, base, end - base,
					       nid, flags);
		}
	}

	if (!nr_new)
		return 0;

	/*
	 * If this was the first round, resize array and repeat for actual
	 * insertions; otherwise, merge and return.
	 */
	if (!insert) {
		/* grow the regions array until the new regions will fit */
		while (type->cnt + nr_new > type->max)
			if (memblock_double_array(type, obase, size) < 0)
				return -ENOMEM;
		/* room guaranteed now: redo the scan, inserting this time */
		insert = true;
		goto repeat;
	} else {
		memblock_merge_regions(type, start_rgn, end_rgn);
		return 0;
	}
}
这里的代码和dtb创建device tree的代码很类似,第一次确定device tree需要分配的内存大小,第二次填充数据;而这里是如果第一次空间不是很足够,则第一次确定要增加多少,之后确定要不要增大空间,第二次再插入具体的region。
(存放memblock的内存是谁管的呢???)
下面先看一下memblock_double_array函数(这里负责延伸整个array空间),也是蛮长的:
/*
 * memblock_double_array - double the capacity of a type's regions array.
 * @type:           memblock type whose regions array is full
 * @new_area_start: start of the range whose insertion triggered the resize
 * @new_area_size:  size of that range
 *
 * Allocates a new array with twice the capacity (via slab if available,
 * otherwise from memblock itself), copies the old contents across, and
 * frees the old array unless it is the static boot-time one.
 * Returns 0 on success, -1 on failure.
 */
static int __init_memblock memblock_double_array(struct memblock_type *type,
						phys_addr_t new_area_start,
						phys_addr_t new_area_size)
{
	struct memblock_region *new_array, *old_array;
	phys_addr_t old_alloc_size, new_alloc_size;
	phys_addr_t old_size, new_size, addr, new_end;
	/*
	 * Can the slab allocator be used yet?  Decided by a global flag;
	 * when called during parse_dtb() this is still false.
	 */
	int use_slab = slab_is_available();
	int *in_slab;

	/* We don't allow resizing until we know about the reserved regions
	 * of memory that aren't suitable for allocation
	 */
	if (!memblock_can_resize)
		return -1;

	/* Calculate new doubled size */
	/* simply double the current capacity */
	old_size = type->max * sizeof(struct memblock_region);
	new_size = old_size << 1;
	/*
	 * We need to allocated new one align to PAGE_SIZE,
	 * so we can free them completely later.
	 */
	/* page-align both allocation sizes */
	old_alloc_size = PAGE_ALIGN(old_size);
	new_alloc_size = PAGE_ALIGN(new_size);

	/* Retrieve the slab flag */
	if (type == &memblock.memory)
		in_slab = &memblock_memory_in_slab;
	else
		in_slab = &memblock_reserved_in_slab;

	/* Try to find some space for it */
	/* once slab is up, simply kmalloc the new array */
	if (use_slab) {
		new_array = kmalloc(new_size, GFP_KERNEL);
		addr = new_array ? __pa(new_array) : 0;
	} else {
		/* only exclude range when trying to double reserved.regions */
		if (type != &memblock.reserved)
			new_area_start = new_area_size = 0;

		/*
		 * For the memory type the search starts at 0; for the
		 * reserved type it starts past the range being added so the
		 * new array cannot collide with that range.
		 */
		addr = memblock_find_in_range(new_area_start + new_area_size,
						memblock.current_limit,
						new_alloc_size, PAGE_SIZE);
		/* retry below the new area if the first search failed */
		if (!addr && new_area_size)
			addr = memblock_find_in_range(0,
				min(new_area_start, memblock.current_limit),
				new_alloc_size, PAGE_SIZE);

		new_array = addr ? __va(addr) : NULL;
	}
	if (!addr) {
		pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
		       type->name, type->max, type->max * 2);
		return -1;
	}

	new_end = addr + new_size - 1;
	memblock_dbg("memblock: %s is doubled to %ld at [%pa-%pa]",
			type->name, type->max * 2, &addr, &new_end);

	/*
	 * Found space, we now need to move the array over before we add the
	 * reserved region since it may be our reserved array itself that is
	 * full.
	 */
	memcpy(new_array, type->regions, old_size);
	memset(new_array + type->max, 0, old_size);
	old_array = type->regions;
	type->regions = new_array;
	type->max <<= 1;

	/* Free old array. We needn't free it if the array is the static one */
	if (*in_slab)
		kfree(old_array);
	else if (old_array != memblock_memory_init_regions &&
		 old_array != memblock_reserved_init_regions)
		/* essentially removes the old array's range from reserved */
		memblock_free(old_array, old_alloc_size);

	/*
	 * Reserve the new array if that comes from the memblock. Otherwise, we
	 * needn't do it
	 */
	/* mark the new array reserved so it is never handed out again */
	if (!use_slab)
		BUG_ON(memblock_reserve(addr, new_alloc_size));

	/* Update slab flag */
	*in_slab = use_slab;

	return 0;
}
在mm/memblock.c中(memblock_memory_in_slab以及memblock_reserved_in_slab定义如下):
(也就是说,这两个值的初值为0)
/* nonzero once the corresponding regions array lives in slab memory */
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
看上面的代码,核心的分配函数应该为memblock_find_in_range:
/*
 * memblock_find_in_range - find a free area in the given physical range.
 * @start: lower bound of the search range
 * @end:   upper bound of the search range
 * @size:  size of the free area to find
 * @align: alignment of the free area's base address
 *
 * Searches [@start, @end) for a free area, first honoring the current
 * memblock flags; if that fails with MEMBLOCK_MIRROR set, drops the
 * mirror requirement and retries once.
 * Returns the found address on success, 0 on failure.
 */
static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
					phys_addr_t end, phys_addr_t size,
					phys_addr_t align)
{
	enum memblock_flags flags = choose_memblock_flags();
	phys_addr_t found;

	for (;;) {
		found = memblock_find_in_range_node(size, align, start, end,
						    NUMA_NO_NODE, flags);
		if (found || !(flags & MEMBLOCK_MIRROR))
			return found;

		pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n",
				    &size);
		flags &= ~MEMBLOCK_MIRROR;
	}
}
继续调用,一直调用到__memblock_find_range_bottom_up或者__memblock_find_range_top_down(依据memblock的bottom_up成员,该成员决定分配内存自底向上还是自顶向下)。
/*
 * __memblock_find_range_bottom_up - bottom-up search for free area.
 * @start: start of the candidate range
 * @end:   end of the candidate range
 * @size:  size of the free area to find
 * @align: alignment of the free area's base address
 * @nid:   NUMA node to restrict the search to, or NUMA_NO_NODE
 * @flags: pick regions with these memblock flags
 *
 * Walks the free memory ranges from low to high addresses and returns
 * the first aligned address that fits @size, or 0 if none does.
 */
static phys_addr_t __init_memblock
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
				phys_addr_t size, phys_addr_t align, int nid,
				enum memblock_flags flags)
{
	phys_addr_t r_start, r_end, aligned;
	u64 i;

	for_each_free_mem_range(i, nid, flags, &r_start, &r_end, NULL) {
		/* restrict the free range to [start, end) */
		r_start = clamp(r_start, start, end);
		r_end = clamp(r_end, start, end);

		aligned = round_up(r_start, align);
		if (aligned < r_end && r_end - aligned >= size)
			return aligned;
	}

	return 0;
}
遍历range,这里使用的函数是for_each_free_mem_range:
/*
 * for_each_free_mem_range - iterate over free areas: parts of
 * memblock.memory that are not covered by memblock.reserved.
 */
#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid)	\
	__for_each_mem_range(i, &memblock.memory, &memblock.reserved,	\
			     nid, flags, p_start, p_end, p_nid)

/*
 * __for_each_mem_range - iterate areas of @type_a not covered by @type_b.
 * The 64-bit cursor @i packs one index per type (low/high 32 bits);
 * __next_mem_range() sets it to ULLONG_MAX when iteration is finished.
 */
#define __for_each_mem_range(i, type_a, type_b, nid, flags,		\
			     p_start, p_end, p_nid)			\
	for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b,	\
				     p_start, p_end, p_nid);		\
	     i != (u64)ULLONG_MAX;					\
	     __next_mem_range(&i, nid, flags, type_a, type_b,		\
			      p_start, p_end, p_nid))
for_each_free_mem_range这个宏用于遍历memory的regions数组,并排除其中与reserved的regions重叠的部分,也就是遍历所有空闲内存区间。
/*
 * __next_mem_range - next function for __for_each_mem_range().
 * @idx:       64-bit cursor: low 32 bits index @type_a, high 32 bits @type_b
 * @nid:       node to restrict the walk to, or NUMA_NO_NODE for any node
 * @flags:     pick memory regions with these flags
 * @type_a:    regions to iterate (e.g. memblock.memory)
 * @type_b:    regions to exclude (e.g. memblock.reserved), or NULL
 * @out_start: ptr to store the start of the found area, or NULL
 * @out_end:   ptr to store the end of the found area, or NULL
 * @out_nid:   ptr to store the node id of the found area, or NULL
 *
 * Reports the next area of @type_a that is not covered by @type_b,
 * treating the gaps between consecutive @type_b regions as the usable
 * holes.  Sets *@idx to ULLONG_MAX when there is nothing left.
 */
void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
		      struct memblock_type *type_a,
		      struct memblock_type *type_b, phys_addr_t *out_start,
		      phys_addr_t *out_end, int *out_nid)
{
	/* unpack the two per-type indices from the combined cursor */
	int idx_a = *idx & 0xffffffff;
	int idx_b = *idx >> 32;

	if (WARN_ONCE(nid == MAX_NUMNODES,
	"Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
		nid = NUMA_NO_NODE;

	for (; idx_a < type_a->cnt; idx_a++) {
		struct memblock_region *m = &type_a->regions[idx_a];

		phys_addr_t m_start = m->base;
		phys_addr_t m_end = m->base + m->size;
		int	    m_nid = memblock_get_region_node(m);

		if (should_skip_region(type_a, m, nid, flags))
			continue;

		/* no exclusion list: report the whole region as-is */
		if (!type_b) {
			if (out_start)
				*out_start = m_start;
			if (out_end)
				*out_end = m_end;
			if (out_nid)
				*out_nid = m_nid;
			idx_a++;
			*idx = (u32)idx_a | (u64)idx_b << 32;
			return;
		}

		/* scan areas before each reservation */
		for (; idx_b < type_b->cnt + 1; idx_b++) {
			struct memblock_region *r;
			phys_addr_t r_start;
			phys_addr_t r_end;

			/*
			 * [r_start, r_end) is the hole between reservation
			 * idx_b-1 and reservation idx_b (open-ended at both
			 * extremes of the address space).
			 */
			r = &type_b->regions[idx_b];
			r_start = idx_b ? r[-1].base + r[-1].size : 0;
			r_end = idx_b < type_b->cnt ?
				r->base : PHYS_ADDR_MAX;

			/*
			 * if idx_b advanced past idx_a,
			 * break out to advance idx_a
			 */
			if (r_start >= m_end)
				break;
			/* if the two regions intersect, we're done */
			if (m_start < r_end) {
				if (out_start)
					*out_start =
						max(m_start, r_start);
				if (out_end)
					*out_end = min(m_end, r_end);
				if (out_nid)
					*out_nid = m_nid;
				/*
				 * The region which ends first is
				 * advanced for the next iteration.
				 */
				if (m_end <= r_end)
					idx_a++;
				else
					idx_b++;
				*idx = (u32)idx_a | (u64)idx_b << 32;
				return;
			}
		}
	}

	/* signal end of iteration */
	*idx = ULLONG_MAX;
}
也就是说,现在实现了memblock数据的自举(刚开始的时候是一个静态数组,后来如果静态数组大小不够了,那么就需要从regions代表的内存中申请内存存放memblock的数组?)。interesting!!!
现在看一下memblock_alloc的代码:
-memblock_alloc
-memblock_alloc_try_nid
-memblock_alloc_internal
-memblock_alloc_range_nid
/*
 * memblock_alloc_range_nid - allocate a boot memory block.
 * @size:      size of the block in bytes
 * @align:     alignment of the block's base address
 * @start:     lower bound of the search range
 * @end:       upper bound of the search range
 * @nid:       preferred NUMA node, or NUMA_NO_NODE
 * @exact_nid: if true, never fall back to other nodes
 *
 * Finds a free range and marks it reserved, falling back from @nid to
 * any node (unless @exact_nid), then retrying without MEMBLOCK_MIRROR,
 * before giving up.  Returns the physical address on success, 0 on
 * failure.
 */
phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
					phys_addr_t align, phys_addr_t start,
					phys_addr_t end, int nid,
					bool exact_nid)
{
	enum memblock_flags flags = choose_memblock_flags();
	phys_addr_t found;

	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
		nid = NUMA_NO_NODE;

	if (!align) {
		/* Can't use WARNs this early in boot on powerpc */
		dump_stack();
		align = SMP_CACHE_BYTES;
	}

again:
	/* first try: honor both the node and the flags */
	found = memblock_find_in_range_node(size, align, start, end, nid,
					    flags);
	if (found && !memblock_reserve(found, size))
		goto done;

	/* fall back to any node unless the caller demands an exact one */
	if (nid != NUMA_NO_NODE && !exact_nid) {
		found = memblock_find_in_range_node(size, align, start,
						    end, NUMA_NO_NODE,
						    flags);
		if (found && !memblock_reserve(found, size))
			goto done;
	}

	/* last resort: drop the mirrored-memory requirement and retry */
	if (flags & MEMBLOCK_MIRROR) {
		flags &= ~MEMBLOCK_MIRROR;
		pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n",
				    &size);
		goto again;
	}

	return 0;

done:
	/*
	 * Skip kmemleak for those places like kasan_init() and
	 * early_pgtable_alloc() due to high volume.
	 */
	if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
		/*
		 * Memblock allocated blocks are never reported as
		 * leaks. This is because many of these blocks are
		 * only referred via the physical address which is
		 * not looked up by kmemleak.
		 */
		kmemleak_alloc_phys(found, size, 0);

	/*
	 * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP,
	 * require memory to be accepted before it can be used by the
	 * guest.
	 *
	 * Accept the memory of the allocated buffer.
	 */
	accept_memory(found, found + size);

	return found;
}
这里核心是调用memblock_find_in_range_node函数分配range,然后调用memblock_reserve函数将刚才申请到的memory region加入reserved中。
现在看完了memblock的组织以及申请内存的逻辑,下次分析memblock→伙伴系统。???