vmalloc (Part 1)
A colleague ran into a vmalloc problem yesterday, so today I set aside some time to look at how vmalloc is implemented.
Part 1: a first pass over the simplest code path: how the vm_struct and its associated vmap_area are allocated, and how the backing pages are requested.
The mapping of the pages after allocation is left for a later, more detailed analysis.
Start reading directly from the vmalloc function (mm/vmalloc.c):
void *vmalloc(unsigned long size)
{
return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc);
This __builtin_return_address(x) is rather interesting: it returns the return address of the function x levels up the backtrace from the current function, with 0 meaning the current function's own return address. A quick compiler test case is sketched right below.
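A minimal userspace test of the builtin (GCC/Clang; the function name and setup are my own, not from the kernel, and note that levels greater than 0 are unreliable without frame pointers):
#include <stdio.h>
/* Returns the caller's return address, just like vmalloc records its 'caller'. */
__attribute__((noinline))
static void *who_called_me(void)
{
	return __builtin_return_address(0); /* 0 = this function's own return address */
}
int main(void)
{
	printf("return address into main(): %p\n", who_called_me());
	printf("address of main():          %p\n", (void *)main);
	return 0;
}
The first printed address falls inside main(), just after the call site; this is exactly the kind of value vmalloc stashes into its caller argument.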
The __vmalloc_node function:
void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, int node, const void *caller)
{
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
gfp_mask, PAGE_KERNEL, 0, node, caller);
}
(TODO: analyze how VMALLOC_START and VMALLOC_END are determined.)
(PS: when called through vmalloc, start and end are VMALLOC_START and VMALLOC_END.)
void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller)
{
struct vm_struct *area;
void *ret;
kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
unsigned long real_size = size;
unsigned long real_align = align;
unsigned int shift = PAGE_SHIFT;
if (WARN_ON_ONCE(!size))
return NULL;
// Sanity check: the requested pages must not exceed the system's total RAM pages
if ((size >> PAGE_SHIFT) > totalram_pages()) {
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, exceeds total pages",
real_size);
return NULL;
}
// Huge-page path, not our focus for now (it mainly adjusts shift, align and size)
if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
unsigned long size_per_node;
/*
* Try huge pages. Only try for PAGE_KERNEL allocations,
* others like modules don't yet expect huge pages in
* their allocations due to apply_to_page_range not
* supporting them.
*/
size_per_node = size;
if (node == NUMA_NO_NODE)
size_per_node /= num_online_nodes();
if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
shift = PMD_SHIFT;
else
shift = arch_vmap_pte_supported_shift(size_per_node);
align = max(real_align, 1UL << shift);
size = ALIGN(real_size, 1UL << shift);
}
again:
// Allocate the vm_struct
area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
VM_UNINITIALIZED | vm_flags, start, end, node,
gfp_mask, caller);
if (!area) {
bool nofail = gfp_mask & __GFP_NOFAIL;
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, vm_struct allocation failed%s",
real_size, (nofail) ? ". Retrying." : "");
if (nofail) {
schedule_timeout_uninterruptible(1);
goto again;
}
goto fail;
}
/*
* Prepare arguments for __vmalloc_area_node() and
* kasan_unpoison_vmalloc().
*/
// KASAN handling, not our focus for now
if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
if (kasan_hw_tags_enabled()) {
/*
* Modify protection bits to allow tagging.
* This must be done before mapping.
*/
prot = arch_vmap_pgprot_tagged(prot);
/*
* Skip page_alloc poisoning and zeroing for physical
* pages backing VM_ALLOC mapping. Memory is instead
* poisoned and zeroed by kasan_unpoison_vmalloc().
*/
gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
}
/* Take note that the mapping is PAGE_KERNEL. */
kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
}
/* Allocate physical pages and map them into vmalloc space. */
// Allocate the actual physical memory
ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
if (!ret)
goto fail;
/*
* Mark the pages as accessible, now that they are mapped.
* The condition for setting KASAN_VMALLOC_INIT should complement the
* one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
* to make sure that memory is initialized under the same conditions.
* Tag-based KASAN modes only assign tags to normal non-executable
* allocations, see __kasan_unpoison_vmalloc().
*/
kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
(gfp_mask & __GFP_SKIP_ZERO))
kasan_flags |= KASAN_VMALLOC_INIT;
/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
* flag. It means that vm_struct is not fully initialized.
* Now, it is fully initialized, so remove this flag here.
*/
// Clear the area's VM_UNINITIALIZED flag
clear_vm_uninitialized_flag(area);
size = PAGE_ALIGN(size);
if (!(vm_flags & VM_DEFER_KMEMLEAK))
kmemleak_vmalloc(area, size, gfp_mask);
// Return the virtual address
return area->addr;
fail:
if (shift > PAGE_SHIFT) {
shift = PAGE_SHIFT;
align = real_align;
size = real_size;
goto again;
}
return NULL;
}
As the return value (return area->addr) shows, memory allocated by vmalloc is managed through a struct vm_struct.
struct vm_struct {
struct vm_struct *next; // all vm_structs are chained into a list
void *addr; // start address of the area vmalloc handed out
unsigned long size; // size of the area
unsigned long flags; // type of the area
struct page **pages; // array of page pointers
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
unsigned int page_order;
#endif
unsigned int nr_pages; // total number of pages
phys_addr_t phys_addr; // usually 0; only used by ioremap
const void *caller; // return address of the function that called vmalloc
};
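To make the bookkeeping concrete, here is a minimal kernel-module sketch that exercises vmalloc/vfree; the module name and the size are made up for illustration:
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
static void *buf;
static int __init vmalloc_demo_init(void)
{
	/* Virtually contiguous; the backing pages need not be physically contiguous. */
	buf = vmalloc(4 * PAGE_SIZE);
	if (!buf)
		return -ENOMEM;
	pr_info("vmalloc_demo: buf=%px\n", buf);
	return 0;
}
static void __exit vmalloc_demo_exit(void)
{
	vfree(buf);
}
module_init(vmalloc_demo_init);
module_exit(vmalloc_demo_exit);
MODULE_LICENSE("GPL");
The resulting area shows up in /proc/vmallocinfo, together with the caller recorded via __builtin_return_address(0).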
Next, let's look at the __get_vm_area_node function to see how the struct vm_struct is allocated.
(The start and end passed here are still VMALLOC_START and VMALLOC_END.)
static struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long shift, unsigned long flags,
unsigned long start, unsigned long end, int node,
gfp_t gfp_mask, const void *caller)
{
struct vmap_area *va;
struct vm_struct *area;
unsigned long requested_size = size;
// vmalloc must not be called from interrupt context
BUG_ON(in_interrupt());
// Round the size up, which fixes the number of pages
size = ALIGN(size, 1ul << shift);
if (unlikely(!size))
return NULL;
// ioremap path, not analyzed here
if (flags & VM_IOREMAP)
align = 1ul << clamp_t(int, get_count_order_long(size),
PAGE_SHIFT, IOREMAP_MAX_ORDER);
// First allocate the area itself via kzalloc_node
area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!area))
return NULL;
// One extra page is added here: a guard page, left unmapped to catch out-of-bounds accesses (skipped when VM_NO_GUARD is set)
if (!(flags & VM_NO_GUARD))
size += PAGE_SIZE;
// Allocate the struct vmap_area via alloc_vmap_area
va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
if (IS_ERR(va)) {
kfree(area);
return NULL;
}
// KASAN-related, not our focus for now
kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
// Associate the area with the va
setup_vmalloc_vm(area, va, flags, caller);
return area;
}
Here a new structure makes its first appearance: struct vmap_area.
struct vmap_area {
unsigned long va_start; // start of the virtual address range
unsigned long va_end; // end of the virtual address range
struct rb_node rb_node; /* address sorted rbtree */
struct list_head list; /* address sorted list */
/*
* The following two variables can be packed, because
* a vmap_area object can be either:
* 1) in "free" tree (root is free_vmap_area_root)
* 2) or "busy" tree (root is vmap_area_root)
*/
// A union: which member is live depends on whether the va sits in the free tree or the busy tree
union {
unsigned long subtree_max_size; /* in "free" tree */
struct vm_struct *vm; /* in "busy" tree */
};
};
This struct vmap_area is allocated by calling alloc_vmap_area.
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask)
{
struct vmap_area *va;
unsigned long freed;
unsigned long addr;
int purged = 0;
int ret;
BUG_ON(!size);
BUG_ON(offset_in_page(size));
BUG_ON(!is_power_of_2(align));
if (unlikely(!vmap_initialized))
return ERR_PTR(-EBUSY);
might_sleep();
gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
// Allocate the va itself from its slab cache
va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
/*
* Only scan the relevant parts containing pointers to other objects
* to avoid false negatives.
*/
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
retry:
preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
// The core is __alloc_vmap_area, which carves the virtual address range out of the free tree
addr = __alloc_vmap_area(size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
*/
// A return value equal to vend signals failure
if (unlikely(addr == vend))
goto overflow;
// Fill in the va; addr was handed out by __alloc_vmap_area
va->va_start = addr;
va->va_end = addr + size;
va->vm = NULL;
spin_lock(&vmap_area_lock);
// Insert the new vmap_area into the busy tree and list
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
ret = kasan_populate_vmalloc(addr, size);
if (ret) {
free_vmap_area(va);
return ERR_PTR(ret);
}
return va;
overflow:
if (!purged) {
purge_vmap_area_lazy();
purged = 1;
goto retry;
}
freed = 0;
blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
if (freed > 0) {
purged = 0;
goto retry;
}
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
size);
kmem_cache_free(vmap_area_cachep, va);
return ERR_PTR(-EBUSY);
}
The __alloc_vmap_area function:
(Here vstart and vend are still VMALLOC_START and VMALLOC_END.)
static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
unsigned long vstart, unsigned long vend)
{
unsigned long nva_start_addr;
struct vmap_area *va;
enum fit_type type;
int ret;
// Walk the tree (left, node, right) to find the lowest vmap_area that fits
va = find_vmap_lowest_match(size, align, vstart);
if (unlikely(!va))
return vend;
// Adjust the start address: take the larger of the two starts, rounded up for alignment
if (va->va_start > vstart)
nva_start_addr = ALIGN(va->va_start, align);
else
nva_start_addr = ALIGN(vstart, align);
/* Check the "vend" restriction. */
// Must not run past the end of the allowed virtual range
if (nva_start_addr + size > vend)
return vend;
/* Classify what we have found. */
// Classify the fit
type = classify_va_fit_type(va, nva_start_addr, size);
if (WARN_ON_ONCE(type == NOTHING_FIT))
return vend;
/* Update the free vmap_area. */
// Update the free vmap_area accordingly
ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
if (ret)
return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
find_vmap_lowest_match_check(size);
#endif
return nva_start_addr;
}
It first calls find_vmap_lowest_match to find the most suitable struct vmap_area.
static __always_inline struct vmap_area *
find_vmap_lowest_match(unsigned long size,
unsigned long align, unsigned long vstart)
{
struct vmap_area *va;
struct rb_node *node;
unsigned long length;
/* Start from the root. */
node = free_vmap_area_root.rb_node;
/* Adjust the search size for alignment overhead. */
length = size + align - 1;
while (node) {
va = rb_entry(node, struct vmap_area, rb_node);
// If the left subtree's largest free block covers the requested length
// and this node starts above vstart,
// descend into the left subtree
if (get_subtree_max_size(node->rb_left) >= length &&
vstart < va->va_start) {
node = node->rb_left;
} else {
// This node itself satisfies the request: done
if (is_within_this_va(va, size, align, vstart))
return va;
/*
* Does not make sense to go deeper towards the right
* sub-tree if it does not have a free block that is
* equal or bigger to the requested search length.
*/
// Otherwise try the right subtree
if (get_subtree_max_size(node->rb_right) >= length) {
node = node->rb_right;
continue;
}
/*
* OK. We roll back and find the first right sub-tree,
* that will satisfy the search criteria. It can happen
* only once due to "vstart" restriction.
*/
// Neither fits: backtrack up the tree
while ((node = rb_parent(node))) {
va = rb_entry(node, struct vmap_area, rb_node);
if (is_within_this_va(va, size, align, vstart))
return va;
// ...and retry the right subtree after backtracking
if (get_subtree_max_size(node->rb_right) >= length &&
vstart <= va->va_start) {
node = node->rb_right;
break;
}
}
}
}
return NULL;
}
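One detail worth pausing on: the search length is inflated to size + align - 1 because, in the worst case, that much room is needed to place an aligned block inside an arbitrarily positioned free area. A tiny userspace check of that worst case (ALIGN_UP is a local rewrite of the kernel's ALIGN macro; the addresses are made up):
#include <assert.h>
#include <stdio.h>
/* Same rounding as the kernel's ALIGN() (align must be a power of two). */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))
int main(void)
{
	unsigned long va_start = 0x1001; /* worst case: just past an alignment boundary */
	unsigned long size = 0x2000, align = 0x1000;
	unsigned long nva = ALIGN_UP(va_start, align); /* 0x2000 */
	/* An aligned block always fits in a free area of size + align - 1 bytes. */
	assert(nva + size <= va_start + size + align - 1);
	printf("aligned start: 0x%lx\n", nva);
	return 0;
}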
Classification with classify_va_fit_type (va is the vmap_area found by the rbtree search, nva_start_addr is the adjusted start address, size is the requested size):
static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
unsigned long nva_start_addr, unsigned long size)
{
enum fit_type type;
/* Check if it is within VA. */
// Not within this VA at all
if (nva_start_addr < va->va_start ||
nva_start_addr + size > va->va_end)
return NOTHING_FIT;
/* Now classify. */
if (va->va_start == nva_start_addr) {
if (va->va_end == nva_start_addr + size)
// exact fit
type = FL_FIT_TYPE;
else
// flush with the left edge
type = LE_FIT_TYPE;
} else if (va->va_end == nva_start_addr + size) {
// flush with the right edge
type = RE_FIT_TYPE;
} else {
// touches neither edge
type = NE_FIT_TYPE;
}
return type;
}
Comparing the found va against the requested range (start, start + size) gives these cases: no fit, full fit, left-edge fit, right-edge fit, and no-edge fit.
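To see the four fit types with concrete numbers, here is a small userspace model of the same classification (the ranges are invented for illustration):
#include <stdio.h>
enum fit_type { NOTHING_FIT, FL_FIT_TYPE, LE_FIT_TYPE, RE_FIT_TYPE, NE_FIT_TYPE };
struct range { unsigned long start, end; };
/* Mirrors the kernel's classify_va_fit_type() on plain ranges. */
static enum fit_type classify(struct range va, unsigned long nva, unsigned long size)
{
	if (nva < va.start || nva + size > va.end)
		return NOTHING_FIT;
	if (va.start == nva)
		return (va.end == nva + size) ? FL_FIT_TYPE : LE_FIT_TYPE;
	if (va.end == nva + size)
		return RE_FIT_TYPE;
	return NE_FIT_TYPE;
}
int main(void)
{
	struct range va = { 0x1000, 0x5000 }; /* a free area of four 4K pages */
	static const char *names[] = { "NOTHING", "FL", "LE", "RE", "NE" };
	printf("%s\n", names[classify(va, 0x1000, 0x4000)]); /* FL: exact */
	printf("%s\n", names[classify(va, 0x1000, 0x2000)]); /* LE: left edge */
	printf("%s\n", names[classify(va, 0x3000, 0x2000)]); /* RE: right edge */
	printf("%s\n", names[classify(va, 0x2000, 0x1000)]); /* NE: middle */
	return 0;
}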
Updating with adjust_va_to_fit_type:
static __always_inline int
adjust_va_to_fit_type(struct vmap_area *va,
unsigned long nva_start_addr, unsigned long size,
enum fit_type type)
{
struct vmap_area *lva = NULL;
// full fit
if (type == FL_FIT_TYPE) {
/*
* No need to split VA, it fully fits.
*
* | |
* V NVA V
* |---------------|
*/
// Unlink va from the free tree
// and return it to the vmap_area_cachep slab
unlink_va(va, &free_vmap_area_root);
kmem_cache_free(vmap_area_cachep, va);
// left-edge fit
} else if (type == LE_FIT_TYPE) {
/*
* Split left edge of fit VA.
*
* | |
* V NVA V R
* |-------|-------|
*/
// Move va's start forward past the allocation
va->va_start += size;
// right-edge fit
} else if (type == RE_FIT_TYPE) {
/*
* Split right edge of fit VA.
*
* | |
* L V NVA V
* |-------|-------|
*/
// Pull va's end back to the allocation's start
va->va_end = nva_start_addr;
// no-edge fit: the area must be split in two
} else if (type == NE_FIT_TYPE) {
/*
* Split no edge of fit VA.
*
* | |
* L V NVA V R
* |---|-------|---|
*/
lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
if (unlikely(!lva)) {
/*
* For percpu allocator we do not do any pre-allocation
* and leave it as it is. The reason is it most likely
* never ends up with NE_FIT_TYPE splitting. In case of
* percpu allocations offsets and sizes are aligned to
* fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
* are its main fitting cases.
*
* There are a few exceptions though, as an example it is
* a first allocation (early boot up) when we have "one"
* big free space that has to be split.
*
* Also we can hit this path in case of regular "vmap"
* allocations, if "this" current CPU was not preloaded.
* See the comment in alloc_vmap_area() why. If so, then
* GFP_NOWAIT is used instead to get an extra object for
* split purpose. That is rare and most time does not
* occur.
*
* What happens if an allocation gets failed. Basically,
* an "overflow" path is triggered to purge lazily freed
* areas to free some memory, then, the "retry" path is
* triggered to repeat one more time. See more details
* in alloc_vmap_area() function.
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (!lva)
return -1;
}
/*
* Build the remainder.
*/
// Create lva to track the free space left of the allocation
lva->va_start = va->va_start;
lva->va_end = nva_start_addr;
/*
* Shrink this VA to remaining size.
*/
// Shrink the current va down to the free space on the right
va->va_start = nva_start_addr + size;
} else {
return -1;
}
if (type != FL_FIT_TYPE) {
// Recompute subtree_max_size, propagating upward from this node
augment_tree_propagate_from(va);
// Insert lva (the left remainder)
if (lva) /* type == NE_FIT_TYPE */
insert_vmap_area_augment(lva, &va->rb_node,
&free_vmap_area_root, &free_vmap_area_list);
}
return 0;
}
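Continuing the NE_FIT case with the numbers from the sketch above: carving [0x2000, 0x3000) out of the free area [0x1000, 0x5000) leaves lva = [0x1000, 0x2000) on the left and shrinks va to [0x3000, 0x5000) on the right. A minimal check, with made-up addresses:
#include <stdio.h>
int main(void)
{
	/* Free area and an allocation carved out of its middle (NE_FIT_TYPE). */
	unsigned long va_start = 0x1000, va_end = 0x5000;
	unsigned long nva = 0x2000, size = 0x1000;
	/* lva keeps the left remainder; va keeps the right remainder. */
	printf("lva: [0x%lx, 0x%lx)\n", va_start, nva);       /* [0x1000, 0x2000) */
	printf("va:  [0x%lx, 0x%lx)\n", nva + size, va_end);  /* [0x3000, 0x5000) */
	return 0;
}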
The insert_vmap_area function inserts the new struct vmap_area:
static void
insert_vmap_area(struct vmap_area *va,
struct rb_root *root, struct list_head *head)
{
struct rb_node **link;
struct rb_node *parent;
// Find the insertion point
link = find_va_links(va, root, NULL, &parent);
if (link)
// Do the actual insertion
link_va(va, root, parent, link, head);
}
The find_va_links function locates the insertion point:
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
struct rb_root *root, struct rb_node *from,
struct rb_node **parent)
{
struct vmap_area *tmp_va;
struct rb_node **link;
// Start from the root node
if (root) {
link = &root->rb_node;
if (unlikely(!*link)) {
*parent = NULL;
return link;
}
} else {
link = &from;
}
/*
* Go to the bottom of the tree. When we hit the last point
* we end up with parent rb_node and correct direction, i name
* it link, where the new va->rb_node will be attached to.
*/
// Walk down the tree
do {
tmp_va = rb_entry(*link, struct vmap_area, rb_node);
/*
* During the traversal we also do some sanity check.
* Trigger the BUG() if there are sides(left/right)
* or full overlaps.
*/
// If va starts before this node ends and va ends at or before this node starts (va lies entirely below this node)
if (va->va_start < tmp_va->va_end &&
va->va_end <= tmp_va->va_start)
// descend into the left subtree
link = &(*link)->rb_left;
// If va ends after this node starts and va starts at or after this node ends (va lies entirely above this node)
else if (va->va_end > tmp_va->va_start &&
va->va_start >= tmp_va->va_end)
// descend into the right subtree
link = &(*link)->rb_right;
else {
WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
return NULL;
}
} while (*link);
// Record the parent
*parent = &tmp_va->rb_node;
return link;
}
link_va performs the actual insertion:
static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
struct rb_node *parent, struct rb_node **link, struct list_head *head)
{
/*
* VA is still not in the list, but we can
* identify its future previous list_head node.
*/
if (likely(parent)) {
// The parent's list node
head = &rb_entry(parent, struct vmap_area, rb_node)->list;
// Choose the list position by child side: a left child precedes its parent in address order, so step back one list node
if (&parent->rb_right != link)
head = head->prev;
}
/* Insert to the rb-tree */
// Link the node into the rbtree
rb_link_node(&va->rb_node, parent, link);
if (root == &free_vmap_area_root) {
/*
* Some explanation here. Just perform simple insertion
* to the tree. We do not set va->subtree_max_size to
* its current size before calling rb_insert_augmented().
* It is because of we populate the tree from the bottom
* to parent levels when the node _is_ in the tree.
*
* Therefore we set subtree_max_size to zero after insertion,
* to let __augment_tree_propagate_from() puts everything to
* the correct order later on.
*/
// rb_insert_augmented rebalances and recolors, maintaining the augmented data
rb_insert_augmented(&va->rb_node,
root, &free_vmap_area_rb_augment_cb);
va->subtree_max_size = 0;
} else {
rb_insert_color(&va->rb_node, root);
}
/* Address-sort this list */
list_add(&va->list, head);
}
The __vmalloc_area_node function allocates the actual physical memory:
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, unsigned int page_shift,
int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
unsigned long addr = (unsigned long)area->addr;
// get_vm_area_size returns the usable size, excluding the guard page added earlier
unsigned long size = get_vm_area_size(area);
unsigned long array_size;
// Number of pages
unsigned int nr_small_pages = size >> PAGE_SHIFT;
unsigned int page_order;
// Size of the page-pointer array
array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
gfp_mask |= __GFP_NOWARN;
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
// Bootstrapping: a page array larger than one page is itself allocated with vmalloc; each nested array is much smaller, so the recursion is strictly bounded
area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
area->caller);
} else {
// Fits within one page: use kmalloc
area->pages = kmalloc_node(array_size, nested_gfp, node);
}
if (!area->pages) {
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocated page array size %lu",
nr_small_pages * PAGE_SIZE, array_size);
free_vm_area(area);
return NULL;
}
set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
page_order = vm_area_page_order(area);
// Allocate the pages
area->nr_pages = vm_area_alloc_pages(gfp_mask, node,
page_order, nr_small_pages, area->pages);
// Account the allocated pages in nr_vmalloc_pages
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
/*
* If not enough pages were obtained to accomplish an
* allocation request, free them via __vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, page order %u, failed to allocate pages",
area->nr_pages * PAGE_SIZE, page_order);
goto fail;
}
if (vmap_pages_range(addr, addr + size, prot, area->pages,
page_shift) < 0) {
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to map pages",
area->nr_pages * PAGE_SIZE);
goto fail;
}
return area->addr;
fail:
__vfree(area->addr);
return NULL;
}
Page allocation with vm_area_alloc_pages:
static inline unsigned int
vm_area_alloc_pages(gfp_t gfp, int nid,
unsigned int order, unsigned int nr_pages, struct page **pages)
{
unsigned int nr_allocated = 0;
struct page *page;
int i;
/*
* For order-0 pages we make use of bulk allocator, if
* the page array is partly or not at all populated due
* to fails, fallback to a single page allocator that is
* more permissive.
*/
if (!order && nid != NUMA_NO_NODE) {
while (nr_allocated < nr_pages) {
unsigned int nr, nr_pages_request;
/*
* A maximum allowed request is hard-coded and is 100
* pages per call. That is done in order to prevent a
* long preemption off scenario in the bulk-allocator
* so the range is [1:100].
*/
nr_pages_request = min(100U, nr_pages - nr_allocated);
nr = alloc_pages_bulk_array_node(gfp, nid,
nr_pages_request, pages + nr_allocated);
nr_allocated += nr;
cond_resched();
/*
* If zero or pages were obtained partly,
* fallback to a single page allocator.
*/
if (nr != nr_pages_request)
break;
}
} else if (order)
/*
* Compound pages required for remap_vmalloc_page if
* high-order pages.
*/
gfp |= __GFP_COMP;
/* High-order pages or fallback path if "bulk" fails. */
while (nr_allocated < nr_pages) {
if (nid == NUMA_NO_NODE)
// The actual page allocation
page = alloc_pages(gfp, order);
else
page = alloc_pages_node(nid, gfp, order);
if (unlikely(!page))
break;
/*
* Careful, we allocate and map page-order pages, but
* tracking is done per PAGE_SIZE page so as to keep the
* vm_struct APIs independent of the physical/mapped size.
*/
for (i = 0; i < (1U << order); i++)
pages[nr_allocated + i] = page + i;
cond_resched();
nr_allocated += 1U << order;
}
return nr_allocated;
}
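The order-0 path batches requests through the bulk allocator, capped at 100 pages per call to bound the preemption-off window, and falls back to single allocations when a batch comes up short. A generic userspace model of that pattern (bulk_alloc and its stall at 3 items are invented stand-ins, not kernel APIs):
#include <stdio.h>
#include <stdlib.h>
/* Stand-in bulk allocator: may deliver fewer items than requested. */
static unsigned int bulk_alloc(unsigned int want, void **out)
{
	unsigned int i, got = want > 3 ? 3 : want; /* pretend it stalls at 3 */
	for (i = 0; i < got; i++)
		out[i] = malloc(64);
	return got;
}
int main(void)
{
	enum { NR = 8, CAP = 100 }; /* CAP mirrors the kernel's 100-page limit */
	void *pages[NR];
	unsigned int done = 0;
	while (done < NR) { /* fast path: batched requests */
		unsigned int want = NR - done < CAP ? NR - done : CAP;
		unsigned int got = bulk_alloc(want, pages + done);
		done += got;
		if (got != want)
			break; /* partial batch: fall back below */
	}
	while (done < NR) /* slow path: one item at a time */
		pages[done++] = malloc(64);
	printf("allocated %u chunks\n", done);
	while (done)
		free(pages[--done]);
	return 0;
}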
The vmap_pages_range function:
static int vmap_pages_range(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
int err;
err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
flush_cache_vmap(addr, end);
return err;
}
The vmap_pages_range_noflush function:
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
WARN_ON(page_shift < PAGE_SHIFT);
if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
page_shift == PAGE_SHIFT)
return vmap_small_pages_range_noflush(addr, end, prot, pages);
for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
int err;
err = vmap_range_noflush(addr, addr + (1UL << page_shift),
__pa(page_address(pages[i])), prot,
page_shift);
if (err)
return err;
addr += 1UL << page_shift;
}
return 0;
}
vmap_range_noflush builds the page tables:
static int vmap_range_noflush(unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int max_page_shift)
{
pgd_t *pgd;
unsigned long start;
unsigned long next;
int err;
pgtbl_mod_mask mask = 0;
might_sleep();
BUG_ON(addr >= end);
start = addr;
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
max_page_shift, &mask);
if (err)
break;
} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
return err;
}