We skip the e820 initialization itself for now and only look at the buddy system and slab initialization; e820 is just the firmware probe of usable physical memory.
1、Initialize pglist_data
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
1.1、initmem_init (set up the memory range managed by each node)
arch -> x86 -> mm -> numa_64.c
void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
int acpi, int k8)
{
// Code before and after elided; we only care about the case where the k8 scan succeeds
if (!numa_off && k8 && !k8_scan_nodes())
return;
}
// nodes_parsed holds the node ids detected at init time; it is essentially a bitmap
// when a node id is detected, the bit indexed by that node id is set to 1
// e.g. nodes_parsed starts as 0000
// if node ids 0/1/3 are detected, nodes_parsed becomes 1011
int __init k8_scan_nodes(void)
{
// ACPI-related logic removed for now; not sure what it does yet
int i;
node_possible_map = nodes_parsed;
// memnode_shift computation removed for now; not sure what it is for
// Walk node_possible_map, which here is essentially nodes_parsed
for_each_node_mask(i, node_possible_map) {
// Compare the node's detected memory range against the real e820 map
// and add the actually usable (non-hole) memory to early_node_map;
// early_node_map maps an nid to its start/end page frame numbers
e820_register_active_regions(i,
nodes[i].start >> PAGE_SHIFT,
nodes[i].end >> PAGE_SHIFT);
// Set up bootmem for each node
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
numa_init_array();
return 0;
}
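To make the nodes_parsed bitmap concrete, here is a minimal userspace sketch (plain C standing in for nodemask_t and for_each_node_mask; the values are made up): it sets the bits for node ids 0/1/3 and walks them the way the loop above does.
#include <stdio.h>

#define MAX_NUMNODES 64

int main(void)
{
        unsigned long nodes_parsed = 0;
        int detected[] = { 0, 1, 3 };
        int i;

        /* "detect" nodes 0, 1 and 3: set the bit indexed by each node id */
        for (i = 0; i < 3; i++)
                nodes_parsed |= 1UL << detected[i];

        printf("nodes_parsed = 0x%lx\n", nodes_parsed);  /* 0xb == binary 1011 */

        /* rough equivalent of for_each_node_mask(i, node_possible_map) */
        for (i = 0; i < MAX_NUMNODES; i++)
                if (nodes_parsed & (1UL << i))
                        printf("node %d is possible\n", i);

        return 0;
}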
1.2、setup_node_bootmem (set up the node's bootmem allocator)
arch -> x86 -> mm -> numa_64.c
// __pa converts a virtual address to a physical address
// __va converts a physical address to a virtual address
// reserve_early records memory that has already been handed out
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
// start pfn / last pfn / physical address of this node's pg_data_t
unsigned long start_pfn, last_pfn, nodedata_phys;
// pgdat_size is sizeof(pg_data_t) rounded up to whole pages
const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
// node id
int nid;
unsigned long bootmap_start, bootmap_pages, bootmap_size;
void *bootmap;
if (!end)
return;
/*
* Don't confuse VM with a node that doesn't have the
* minimum amount of memory:
*/
if (end && (end - start) < NODE_MIN_SIZE)
return;
// Round start up to ZONE_ALIGN (a maximum-order page boundary)
start = roundup(start, ZONE_ALIGN);
start_pfn = start >> PAGE_SHIFT;
last_pfn = end >> PAGE_SHIFT;
// node_data[] is only a static array of pointers,
// so the pg_data_t itself has to be allocated here
node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
SMP_CACHE_BYTES);
if (node_data[nodeid] == NULL)
return;
// Get the physical address of the allocation
nodedata_phys = __pa(node_data[nodeid]);
reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
// Look up which node the allocated physical address actually falls on.
// nid may differ from nodeid, because early_node_mem can fall back
// to memory on another node (typically node 0).
nid = phys_to_nid(nodedata_phys);
// Initialize the pg_data_t
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
NODE_DATA(nodeid)->node_id = nodeid;
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
// Compute how many pages the bootmem bitmap needs for this many page frames
bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
// Place the bootmap right after the node data, aligned up to PAGE_SIZE
bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
// Allocate memory for the bootmap
bootmap = early_node_mem(nodeid, bootmap_start, end,
bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
if (bootmap == NULL) {
free_early(nodedata_phys, nodedata_phys + pgdat_size);
node_data[nodeid] = NULL;
return;
}
bootmap_start = __pa(bootmap);
reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
"BOOTMAP");
// Initialize the bootmem allocator for this node
bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
bootmap_start >> PAGE_SHIFT,
start_pfn, last_pfn);
// Again look up the node the bootmap physically landed on;
// as above, nid may differ from nodeid
nid = phys_to_nid(bootmap_start);
free_bootmem_with_active_regions(nodeid, end);
// Mark the node online
node_set_online(nodeid);
}
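A small sketch of the placement arithmetic above, assuming a made-up address and a made-up sizeof(pg_data_t); ROUNDUP here mirrors what the kernel's roundup() does. It shows the pg_data_t being rounded to whole pages and the bootmap starting on the next page boundary after it.
#include <stdio.h>

#define PAGE_SIZE 4096UL
/* same rounding the kernel's roundup() performs */
#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
        unsigned long nodedata_phys = 0x100000;    /* hypothetical placement */
        unsigned long sizeof_pg_data_t = 7000;     /* made-up size */
        unsigned long pgdat_size = ROUNDUP(sizeof_pg_data_t, PAGE_SIZE);
        /* the bootmap goes right after the pg_data_t, page aligned */
        unsigned long bootmap_start = ROUNDUP(nodedata_phys + pgdat_size, PAGE_SIZE);

        printf("pgdat_size    = %lu bytes (%lu pages)\n",
               pgdat_size, pgdat_size / PAGE_SIZE);
        printf("NODE_DATA at  = 0x%lx\n", nodedata_phys);
        printf("bootmap_start = 0x%lx\n", bootmap_start);
        return 0;
}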
// early_node_mem is the early boot allocator,
// essentially handing out memory straight from the e820 map
// start/end is the memory range managed by this node
// size is the allocation size
// align is the required alignment
static void * __init early_node_mem(int nodeid, unsigned long start,
unsigned long end, unsigned long size,
unsigned long align)
{
unsigned long mem;
if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
start = MAX_DMA_PFN<<PAGE_SHIFT;
if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
end > (MAX_DMA32_PFN<<PAGE_SHIFT))
start = MAX_DMA32_PFN<<PAGE_SHIFT;
mem = find_e820_area(start, end, size, align);
if (mem != -1L)
return __va(mem);
end = max_pfn_mapped << PAGE_SHIFT;
if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
start = MAX_DMA32_PFN<<PAGE_SHIFT;
else
start = MAX_DMA_PFN<<PAGE_SHIFT;
mem = find_e820_area(start, end, size, align);
if (mem != -1L)
return __va(mem);
return NULL;
}
1.3、init_bootmem_node (core bootmem initialization)
/**
* init_bootmem_node - register a node as boot memory
* @pgdat: node to register
* @freepfn: pfn where the bitmap for this node is to be placed
* @startpfn: first pfn on the node
* @endpfn: first pfn after the node
*
* Returns the number of bytes needed to hold the bitmap for this node.
*/
unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
unsigned long startpfn, unsigned long endpfn)
{
return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
}
static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
unsigned long mapstart, unsigned long start, unsigned long end)
{
unsigned long mapsize;
// Effectively a no-op under this memory model
mminit_validate_memmodel_limits(&start, &end);
// Fill in bdata;
// node_bootmem_map is the start address of the bitmap
bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
bdata->node_min_pfn = start;
bdata->node_low_pfn = end;
// Link bdata into bdata_list
link_bootmem(bdata);
mapsize = bootmap_bytes(end - start);
memset(bdata->node_bootmem_map, 0xff, mapsize);
return mapsize;
}
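The bootmem map is one bit per page frame: a set bit means reserved, a cleared bit means free, which is why init_bootmem_core memsets the whole map to 0xff (everything starts reserved) and the free paths clear bits later. A rough userspace sketch of the sizing and of freeing a single pfn (toy values; the real bootmap_bytes also rounds up to sizeof(long)):
#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned long pages = 1000;                /* pfn range covered, made up */
        /* one bit per pfn, rounded up to whole bytes */
        unsigned long bytes = (pages + 7) / 8;
        unsigned char map[128];

        memset(map, 0xff, bytes);                  /* everything starts reserved */
        printf("%lu pfns need a %lu byte bitmap\n", pages, bytes);

        /* "freeing" pfn 42 later just clears its bit */
        map[42 / 8] &= ~(1u << (42 % 8));
        printf("pfn 42 reserved? %s\n",
               (map[42 / 8] >> (42 % 8)) & 1 ? "yes" : "no");
        return 0;
}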
1.4、free_bootmem_with_active_regions (release the usable pages to bootmem)
void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
{
int i;
// Walk all early_node_map ranges belonging to this nid
for_each_active_range_index_in_nid(i, nid) {
unsigned long size_pages = 0;
unsigned long end_pfn = early_node_map[i].end_pfn;
if (early_node_map[i].start_pfn >= max_low_pfn)
continue;
if (end_pfn > max_low_pfn)
end_pfn = max_low_pfn;
size_pages = end_pfn - early_node_map[i].start_pfn;
// Mark the pages at the given start and size as free
free_bootmem_node(NODE_DATA(early_node_map[i].nid),
PFN_PHYS(early_node_map[i].start_pfn),
size_pages << PAGE_SHIFT);
}
}
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
unsigned long start, end;
kmemleak_free_part(__va(physaddr), size);
start = PFN_UP(physaddr);
end = PFN_DOWN(physaddr + size);
mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
}
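Note the rounding directions in free_bootmem_node: the start is rounded up and the end rounded down, so a page that is only partially covered by the range is never marked free. A tiny sketch of PFN_UP/PFN_DOWN with a 4 KiB page and a made-up, unaligned physaddr:
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

int main(void)
{
        unsigned long physaddr = 0x100800;         /* not page aligned */
        unsigned long size = 3 * PAGE_SIZE;

        /* only the fully covered pfns 0x101 and 0x102 end up freed */
        printf("start pfn = 0x%lx\n", PFN_UP(physaddr));
        printf("end pfn   = 0x%lx\n", PFN_DOWN(physaddr + size));
        return 0;
}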
2、Initialize pg_data_t and the zone data structures
arch -> x86 -> mm -> init_64.c
void __init paging_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
max_zone_pfns[ZONE_NORMAL] = max_pfn;
// ??????, to be revisited
sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
// This call currently does nothing, skip it
node_clear_state(0, N_NORMAL_MEMORY);
free_area_init_nodes(max_zone_pfns);
}
// max_zone_pfn is an array giving the highest page frame number of each zone
// e.g. {10, 20, 40} means:
// ZONE_DMA ends at pfn 10
// ZONE_DMA32 ends at pfn 20
// ZONE_NORMAL ends at pfn 40
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
unsigned long nid;
int i;
// Sort early_node_map by start_pfn
sort_node_map();
// Record the boundaries of each zone
memset(arch_zone_lowest_possible_pfn, 0,
sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
// find_min_pfn_with_active_regions walks every early_node_map entry
// and returns the smallest start_pfn
arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
// max_zone_pfn[0] is MAX_DMA_PFN
arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
for (i = 1; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
arch_zone_lowest_possible_pfn[i] =
arch_zone_highest_possible_pfn[i-1];
arch_zone_highest_possible_pfn[i] =
max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
}
arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
// After the code above, arch_zone_lowest_possible_pfn/arch_zone_highest_possible_pfn
// could end up, for example, as:
// arch_zone_lowest_possible_pfn {1, 4, 6}
// arch_zone_highest_possible_pfn {4, 6, 8}
// i.e. DMA pfns [1, 4), DMA32 pfns [4, 6), NORMAL pfns [6, 8)
// (a toy sketch of this computation follows this function)
// Compute the starting pfn of the movable region on each node
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
find_zone_movable_pfns_for_nodes(zone_movable_pfn);
// memory printout code removed
// debug only, skip
mminit_verify_pageflags_layout();
// Set nr_node_ids from the highest possible node id
setup_nr_node_ids();
// Walk every online node
// and initialize its zones
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid, NULL, find_min_pfn_for_node(nid), NULL);
if (pgdat->node_present_pages)
node_set_state(nid, N_HIGH_MEMORY);
check_for_regular_memory(pgdat);
}
}
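A toy version of the boundary computation above, using the example values {4, 6, 8} for max_zone_pfn and 1 for the minimum active pfn (ZONE_MOVABLE is omitted since its entries are zeroed):
#include <stdio.h>

#define MAX_NR_ZONES 3                             /* DMA, DMA32, NORMAL only */

int main(void)
{
        unsigned long max_zone_pfn[MAX_NR_ZONES] = { 4, 6, 8 };
        unsigned long lo[MAX_NR_ZONES], hi[MAX_NR_ZONES];
        const char *name[MAX_NR_ZONES] = { "DMA", "DMA32", "NORMAL" };
        int i;

        lo[0] = 1;                                 /* find_min_pfn_with_active_regions() */
        hi[0] = max_zone_pfn[0];
        for (i = 1; i < MAX_NR_ZONES; i++) {
                lo[i] = hi[i - 1];                 /* zones are stacked back to back */
                hi[i] = max_zone_pfn[i] > lo[i] ? max_zone_pfn[i] : lo[i];
        }

        for (i = 0; i < MAX_NR_ZONES; i++)
                printf("%-6s pfn [%lu, %lu)\n", name[i], lo[i], hi[i]);
        return 0;
}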
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long node_start_pfn, unsigned long *zholes_size)
{
pg_data_t *pgdat = NODE_DATA(nid);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
// Count the node's pages both with and without holes,
// filling pgdat->node_spanned_pages and pgdat->node_present_pages
calculate_node_totalpages(pgdat, zones_size, zholes_size);
// Allocate node_mem_map using the bootmem allocator;
// this is essentially the struct page array for every page on this node
alloc_node_mem_map(pgdat);
// Fill in the zones
free_area_init_core(pgdat, zones_size, zholes_size);
}
2.1、free_area_init_core (core logic)
// In this path both zones_size and zholes_size are NULL
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
enum zone_type j;
int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
// Initialize node_size_lock
pgdat_resize_init(pgdat);
pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
pgdat->kswapd_max_order = 0;
// Ignore cgroup handling for now
pgdat_page_cgroup_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, memmap_pages;
enum lru_list l;
// Compute the zone's total pages and the pages excluding holes;
// size here is a count of page frames
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = size - zone_absent_pages_in_node(nid, j,
zholes_size);
// How many pages the zone's own struct page array takes
memmap_pages = PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
// The small-memory special case handled here is not considered; code removed
// Subtract the DMA reserve
if (j == 0 && realsize > dma_reserve) {
realsize -= dma_reserve;
}
// Account realsize into nr_kernel_pages / nr_all_pages
if (!is_highmem_idx(j))
// Without highmem these pages are directly usable by the kernel
nr_kernel_pages += realsize;
nr_all_pages += realsize;
zone->spanned_pages = size;
zone->present_pages = realsize;
zone->node = nid;
// Zone reclaim only runs once this many unmapped pages are reclaimable
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) / 100;
// Likewise, slab pages are only reclaimed above this threshold
zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
// Memory hotplug related
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
zone->prev_priority = DEF_PRIORITY;
// ??????
zone_pcp_init(zone);
// Initialize a pile of per-zone fields
for_each_lru(l) {
INIT_LIST_HEAD(&zone->lru[l].list);
zone->reclaim_stat.nr_saved_scan[l] = 0;
}
zone->reclaim_stat.recent_rotated[0] = 0;
zone->reclaim_stat.recent_rotated[1] = 0;
zone->reclaim_stat.recent_scanned[0] = 0;
zone->reclaim_stat.recent_scanned[1] = 0;
zap_zone_vm_stats(zone);
zone->flags = 0;
if (!size)
continue;
// ??????
set_pageblock_order(pageblock_default_order());
setup_usemap(pgdat, zone, size);
// Initialize the zone's start pfn and its free_list lists
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
// Initialize every struct page in the zone
memmap_init(size, nid, j, zone_start_pfn);
zone_start_pfn += size;
}
}
2.2、helper func
2.2.1、memmap_init_zone (initialize all pages as reserved)
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context)
{
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
struct zone *z;
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
z = &NODE_DATA(nid)->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
* handed to this function. They do not
* exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
if (!early_pfn_valid(pfn))
continue;
if (!early_pfn_in_nid(pfn, nid))
continue;
}
page = pfn_to_page(pfn);
// Encode zone/node/section into page->flags
set_page_links(page, zone, nid, pfn);
// Some sanity checks
mminit_verify_page_links(page, zone, nid, pfn);
// Set page->_count to 1
init_page_count(page);
// Reset page->_mapcount back to -1
reset_page_mapcount(page);
SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
* to reserve their blocks rather than leaking throughout
* the address space during boot when many long-lived
* kernel allocations are made. Later some blocks near
* the start are marked MIGRATE_RESERVE by
* setup_zone_migrate_reserve()
*
* bitmap is created for zone's valid pfn range. but memmap
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*/
if ((z->zone_start_pfn <= pfn)
&& (pfn < z->zone_start_pfn + z->spanned_pages)
&& !(pfn & (pageblock_nr_pages - 1)))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
// Initialize the lru list head
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
if (!is_highmem_idx(zone))
set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}
}
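The condition guarding set_pageblock_migratetype only fires on pfns that lie inside the zone and sit exactly on a pageblock boundary. A quick sketch with an assumed pageblock_nr_pages of 512 (2 MiB of 4 KiB pages) and made-up zone bounds:
#include <stdio.h>

int main(void)
{
        unsigned long pageblock_nr_pages = 512;    /* assumed block size */
        unsigned long zone_start_pfn = 0x100;
        unsigned long spanned = 0x1000;
        unsigned long pfn;

        /* only the first pfn of each pageblock gets its migratetype set */
        for (pfn = zone_start_pfn; pfn < zone_start_pfn + spanned; pfn++)
                if (!(pfn & (pageblock_nr_pages - 1)))
                        printf("pfn 0x%lx starts a MIGRATE_MOVABLE pageblock\n", pfn);
        return 0;
}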
2.2.2、find_zone_movable_pfns_for_nodes (hard to follow)
/*
* Find the PFN the Movable zone begins in each node. Kernel memory
* is spread evenly between nodes as long as the nodes have enough
* memory. When they don't, some nodes will have more kernelcore than
* others
*/
static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
{
int i, nid;
unsigned long usable_startpfn;
unsigned long kernelcore_node, kernelcore_remaining;
/* save the state before borrow the nodemask */
nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
/*
* If movablecore was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
* and movablecore are specified, then the value of kernelcore
* will be used for required_kernelcore if it's greater than
* what movablecore would have allowed.
*/
if (required_movablecore) {
unsigned long corepages;
/*
* Round-up so that ZONE_MOVABLE is at least as large as what
* was requested by the user
*/
required_movablecore =
roundup(required_movablecore, MAX_ORDER_NR_PAGES);
corepages = totalpages - required_movablecore;
required_kernelcore = max(required_kernelcore, corepages);
}
/* If kernelcore was not specified, there is no ZONE_MOVABLE */
if (!required_kernelcore)
goto out;
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
find_usable_zone_for_movable();
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
restart:
/* Spread kernelcore memory as evenly as possible throughout nodes */
kernelcore_node = required_kernelcore / usable_nodes;
for_each_node_state(nid, N_HIGH_MEMORY) {
/*
* Recalculate kernelcore_node if the division per node
* now exceeds what is necessary to satisfy the requested
* amount of memory for the kernel
*/
if (required_kernelcore < kernelcore_node)
kernelcore_node = required_kernelcore / usable_nodes;
/*
* As the map is walked, we track how much memory is usable
* by the kernel using kernelcore_remaining. When it is
* 0, the rest of the node is usable by ZONE_MOVABLE
*/
kernelcore_remaining = kernelcore_node;
/* Go through each range of PFNs within this node */
for_each_active_range_index_in_nid(i, nid) {
unsigned long start_pfn, end_pfn;
unsigned long size_pages;
start_pfn = max(early_node_map[i].start_pfn,
zone_movable_pfn[nid]);
end_pfn = early_node_map[i].end_pfn;
if (start_pfn >= end_pfn)
continue;
/* Account for what is only usable for kernelcore */
if (start_pfn < usable_startpfn) {
unsigned long kernel_pages;
kernel_pages = min(end_pfn, usable_startpfn)
- start_pfn;
kernelcore_remaining -= min(kernel_pages,
kernelcore_remaining);
required_kernelcore -= min(kernel_pages,
required_kernelcore);
/* Continue if range is now fully accounted */
if (end_pfn <= usable_startpfn) {
/*
* Push zone_movable_pfn to the end so
* that if we have to rebalance
* kernelcore across nodes, we will
* not double account here
*/
zone_movable_pfn[nid] = end_pfn;
continue;
}
start_pfn = usable_startpfn;
}
/*
* The usable PFN range for ZONE_MOVABLE is from
* start_pfn->end_pfn. Calculate size_pages as the
* number of pages used as kernelcore
*/
size_pages = end_pfn - start_pfn;
if (size_pages > kernelcore_remaining)
size_pages = kernelcore_remaining;
zone_movable_pfn[nid] = start_pfn + size_pages;
/*
* Some kernelcore has been met, update counts and
* break if the kernelcore for this node has been
* satisified
*/
required_kernelcore -= min(required_kernelcore,
size_pages);
kernelcore_remaining -= size_pages;
if (!kernelcore_remaining)
break;
}
}
/*
* If there is still required_kernelcore, we do another pass with one
* less node in the count. This will push zone_movable_pfn[nid] further
* along on the nodes that still have memory until kernelcore is
* satisified
*/
usable_nodes--;
if (usable_nodes && required_kernelcore > usable_nodes)
goto restart;
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
for (nid = 0; nid < MAX_NUMNODES; nid++)
zone_movable_pfn[nid] =
roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
out:
/* restore the node_state */
node_states[N_HIGH_MEMORY] = saved_node_state;
}
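Stripped of the corner cases, the idea is: split required_kernelcore evenly across the nodes that have memory, consume that many pages from the bottom of each node, and let ZONE_MOVABLE start right after. A heavily simplified sketch, assuming one contiguous pfn range per node, no pfns below usable_startpfn, and skipping the rebalancing restart pass:
#include <stdio.h>

struct node_range { unsigned long start_pfn, end_pfn; };

int main(void)
{
        struct node_range node[2] = { { 0, 1000 }, { 1000, 3000 } };
        unsigned long required_kernelcore = 1200;  /* made-up "kernelcore=" value */
        unsigned long zone_movable_pfn[2];
        int usable_nodes = 2, nid;

        /* spread kernelcore as evenly as possible across nodes */
        unsigned long kernelcore_node = required_kernelcore / usable_nodes;

        for (nid = 0; nid < 2; nid++) {
                unsigned long size = node[nid].end_pfn - node[nid].start_pfn;
                unsigned long used = size < kernelcore_node ? size : kernelcore_node;

                /* ZONE_MOVABLE starts after this node's share of kernelcore */
                zone_movable_pfn[nid] = node[nid].start_pfn + used;
                printf("node %d: kernel pfn [%lu, %lu), movable pfn [%lu, %lu)\n",
                       nid, node[nid].start_pfn, zone_movable_pfn[nid],
                       zone_movable_pfn[nid], node[nid].end_pfn);
        }
        return 0;
}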
3、Building the zonelists
void build_all_zonelists(void)
{
// By default this can be treated as ZONELIST_ORDER_NODE ordering
set_zonelist_order();
if (system_state == SYSTEM_BOOTING) {
__build_all_zonelists(NULL);
mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
} else {
/* we have to stop all cpus to guarantee there is no user
of zonelist */
stop_machine(__build_all_zonelists, NULL, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
/*
* Disable grouping by mobility if the number of pages in the
* system is too low to allow the mechanism to work. It would be
* more accurate, but expensive to check per-zone. This check is
* made on memory-hotadd so a system can start with mobility
* disabled and enable it later
*/
if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
page_group_by_mobility_disabled = 1;
else
page_group_by_mobility_disabled = 0;
}
static int __build_all_zonelists(void *dummy)
{
int nid;
int cpu;
memset(node_load, 0, sizeof(node_load));
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
build_zonelist_cache(pgdat);
}
/*
* Initialize the boot_pagesets that are going to be used
* for bootstrapping processors. The real pagesets for
* each zone will be allocated later when the per cpu
* allocator is available.
*
* boot_pagesets are used also for bootstrapping offline
* cpus if the system is already booted because the pagesets
* are needed to initialize allocators on a specific cpu too.
* F.e. the percpu allocator needs the page allocator which
* needs the percpu allocator in order to allocate its pagesets
* (a chicken-egg dilemma).
*/
for_each_possible_cpu(cpu)
setup_pageset(&per_cpu(boot_pageset, cpu), 0);
return 0;
}
3.1、helper func
3.1.1、build_zonelists
static int node_order[MAX_NUMNODES];
static void build_zonelists(pg_data_t *pgdat)
{
int j, node, load;
enum zone_type i;
nodemask_t used_mask;
int local_node, prev_node;
struct zonelist *zonelist;
int order = current_zonelist_order;
for (i = 0; i < MAX_ZONELISTS; i++) {
zonelist = pgdat->node_zonelists + i;
zonelist->_zonerefs[0].zone = NULL;
zonelist->_zonerefs[0].zone_idx = 0;
}
local_node = pgdat->node_id;
load = nr_online_nodes;
prev_node = local_node;
nodes_clear(used_mask);
memset(node_order, 0, sizeof(node_order));
j = 0;
// Walk outward from local_node, each time picking the closest not-yet-used node
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
int distance = node_distance(local_node, node);
/*
* If another node is sufficiently far away then it is better
* to reclaim pages in a zone before going off node.
*/
if (distance > RECLAIM_DISTANCE)
zone_reclaim_mode = 1;
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
if (distance != node_distance(local_node, prev_node))
node_load[node] = load;
prev_node = node;
load--;
if (order == ZONELIST_ORDER_NODE)
build_zonelists_in_node_order(pgdat, node);
else
node_order[j++] = node; /* remember order */
}
if (order == ZONELIST_ORDER_ZONE) {
/* calculate node order -- i.e., DMA last! */
build_zonelists_in_zone_order(pgdat, j);
}
build_thisnode_zonelists(pgdat);
}
/*
* Build zonelists ordered by node and zones within node.
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
{
int j;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[0];
for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
;
j = build_zonelists_node(NODE_DATA(node), zonelist, j,
MAX_NR_ZONES - 1);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
}
/*
* Build gfp_thisnode zonelists
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
int j;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[1];
j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
}
/**
* find_next_best_node - find the next node that should appear in a given node's fallback list
* @node: node whose fallback list we're appending
* @used_node_mask: nodemask_t of already used nodes
*
* We use a number of factors to determine which is the next node that should
* appear on a given node's fallback list. The node should not have appeared
* already in @node's fallback list, and it should be the next closest node
* according to the distance array (which contains arbitrary distance values
* from each node to each node in the system), and should also prefer nodes
* with no CPUs, since presumably they'll have very little allocation pressure
* on them otherwise.
* It returns -1 if no node is found.
*/
static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
int n, val;
int min_val = INT_MAX;
int best_node = -1;
const struct cpumask *tmp = cpumask_of_node(0);
// The local node itself is returned first
if (!node_isset(node, *used_node_mask)) {
node_set(node, *used_node_mask);
return node;
}
for_each_node_state(n, N_HIGH_MEMORY) {
// Skip nodes that have already been picked
if (node_isset(n, *used_node_mask))
continue;
/* Use the distance array to find the distance */
val = node_distance(node, n);
/* Penalize nodes under us ("prefer the next node") */
val += (n < node);
/* Give preference to headless and unused nodes */
tmp = cpumask_of_node(n);
if (!cpumask_empty(tmp))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
val *= (MAX_NODE_LOAD*MAX_NUMNODES);
val += node_load[n];
if (val < min_val) {
min_val = val;
best_node = n;
}
}
if (best_node >= 0)
node_set(best_node, *used_node_mask);
return best_node;
}
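A toy of the scoring above with assumed SLIT distances and constants (in the kernel MAX_NODE_LOAD is nr_online_nodes; everything here is made up). It shows how the multiply makes distance dominate while the CPU penalty and node_load only break ties:
#include <stdio.h>
#include <limits.h>

#define MAX_NUMNODES 4
#define MAX_NODE_LOAD MAX_NUMNODES
#define PENALTY_FOR_NODE_WITH_CPUS 1

int main(void)
{
        /* hypothetical distances from node 0 */
        int distance[MAX_NUMNODES] = { 10, 20, 20, 30 };
        int has_cpus[MAX_NUMNODES] = { 1, 1, 0, 1 };
        int node_load[MAX_NUMNODES] = { 0, 0, 0, 0 };
        int node = 0, n, best = -1, min_val = INT_MAX;

        for (n = 1; n < MAX_NUMNODES; n++) {       /* node 0 already used */
                int val = distance[n];
                val += (n < node);                 /* penalize lower-numbered peers */
                if (has_cpus[n])
                        val += PENALTY_FOR_NODE_WITH_CPUS;
                val *= MAX_NODE_LOAD * MAX_NUMNODES;  /* distance dominates ... */
                val += node_load[n];               /* ... load only breaks ties */
                printf("node %d score %d\n", n, val);
                if (val < min_val) {
                        min_val = val;
                        best = n;
                }
        }
        printf("next best node = %d\n", best);     /* node 2: same distance, no CPUs */
        return 0;
}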
4、Initializing the buddy system
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
int aligned;
struct page *page;
unsigned long start, end, pages, count = 0;
if (!bdata->node_bootmem_map)
return 0;
start = bdata->node_min_pfn;
end = bdata->node_low_pfn;
aligned = !(start & (BITS_PER_LONG - 1));
while (start < end) {
unsigned long *map, idx, vec;
map = bdata->node_bootmem_map;
idx = start - bdata->node_min_pfn;
vec = ~map[idx / BITS_PER_LONG];
if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
int order = ilog2(BITS_PER_LONG);
__free_pages_bootmem(pfn_to_page(start), order);
count += BITS_PER_LONG;
} else {
unsigned long off = 0;
while (vec && off < BITS_PER_LONG) {
if (vec & 1) {
page = pfn_to_page(start + off);
__free_pages_bootmem(page, 0);
count++;
}
vec >>= 1;
off++;
}
}
start += BITS_PER_LONG;
}
page = virt_to_page(bdata->node_bootmem_map);
pages = bdata->node_low_pfn - bdata->node_min_pfn;
pages = bootmem_bootmap_pages(pages);
count += pages;
while (pages--)
__free_pages_bootmem(page++, 0);
return count;
}
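The loop above frees pages a machine word (BITS_PER_LONG pfns) at a time: it inverts the bitmap word so that a set bit now means free, hands the whole chunk to the buddy allocator at order ilog2(BITS_PER_LONG) when every page in it is free, and otherwise walks the bits one by one. A userspace sketch of that scan with a two-word toy bitmap and printf standing in for __free_pages_bootmem:
#include <stdio.h>

#define BITS_PER_LONG 64

int main(void)
{
        /* bootmem bitmap: set bit == still reserved.
         * word 0: everything free; word 1: only pfns 64 and 66 free */
        unsigned long map[2] = { 0UL, ~0x5UL };
        unsigned long start, off;
        int w;

        for (w = 0; w < 2; w++) {
                unsigned long vec = ~map[w];       /* now set bit == free */
                start = (unsigned long)w * BITS_PER_LONG;

                if (vec == ~0UL) {
                        printf("free pfns [%lu, %lu) as one order-6 chunk\n",
                               start, start + BITS_PER_LONG);
                        continue;
                }
                for (off = 0; vec && off < BITS_PER_LONG; off++, vec >>= 1)
                        if (vec & 1)
                                printf("free single pfn %lu\n", start + off);
        }
        return 0;
}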
void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
{
if (order == 0) {
__ClearPageReserved(page);
set_page_count(page, 0);
set_page_refcounted(page);
__free_page(page);
} else {
int loop;
prefetchw(page);
for (loop = 0; loop < BITS_PER_LONG; loop++) {
struct page *p = &page[loop];
if (loop + 1 < BITS_PER_LONG)
prefetchw(p + 1);
__ClearPageReserved(p);
set_page_count(p, 0);
}
set_page_refcounted(page);
__free_pages(page, order);
}
}
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
if (order == 0)
free_hot_cold_page(page, 0);
else
__free_pages_ok(page, order);
}
}
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int i;
int bad = 0;
int wasMlocked = __TestClearPageMlocked(page);
trace_mm_page_free_direct(page, order);
kmemcheck_free_shadow(page, order);
for (i = 0 ; i < (1 << order) ; ++i)
bad += free_pages_check(page + i);
if (bad)
return;
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
local_irq_save(flags);
if (unlikely(wasMlocked))
free_page_mlock(page);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, order,
get_pageblock_migratetype(page));
local_irq_restore(flags);
}
4.1、free_one_page(core func)
static void free_one_page(struct zone *zone, struct page *page, int order,
int migratetype)
{
spin_lock(&zone->lock);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
__free_one_page(page, zone, order, migratetype);
spin_unlock(&zone->lock);
}
/*
* Freeing function for a buddy system allocator.
*
* The concept of a buddy system is to maintain direct-mapped table
* (containing bit values) for memory blocks of various "orders".
* The bottom level table contains the map for the smallest allocatable
* units of memory (here, pages), and each level above it describes
* pairs of units from the levels below, hence, "buddies".
* At a high level, all that happens here is marking the table entry
* at the bottom level available, and propagating the changes upward
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
* free pages of length of (1 << order) and marked with PG_buddy. Page's
* order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
* If a block is freed, and its buddy is also free, then this
* triggers coalescing into a block of larger size.
*
* -- wli
*/
static inline void __free_one_page(struct page *page,
struct zone *zone, unsigned int order,
int migratetype)
{
unsigned long page_idx;
if (unlikely(PageCompound(page)))
if (unlikely(destroy_compound_page(page, order)))
return;
VM_BUG_ON(migratetype == -1);
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
VM_BUG_ON(page_idx & ((1 << order) - 1));
VM_BUG_ON(bad_range(zone, page));
while (order < MAX_ORDER-1) {
unsigned long combined_idx;
struct page *buddy;
buddy = __page_find_buddy(page, page_idx, order);
if (!page_is_buddy(page, buddy, order))
break;
/* Our buddy is free, merge with it and move up one order. */
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
combined_idx = __find_combined_index(page_idx, order);
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
order++;
}
set_page_order(page, order);
list_add(&page->lru,
&zone->free_area[order].free_list[migratetype]);
zone->free_area[order].nr_free++;
}
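The buddy lookup in the merge loop is pure bit arithmetic on the pfn offset within the MAX_ORDER-aligned block: the buddy of a block at index page_idx and order o is page_idx ^ (1 << o), and the merged block starts at page_idx & ~(1 << o). A standalone sketch of one merge chain, assuming every buddy happens to be free (indexes only, no struct page):
#include <stdio.h>

int main(void)
{
        unsigned long page_idx = 52;               /* block being freed, order 0 */
        unsigned int order = 0, max_order = 4;     /* toy MAX_ORDER */

        while (order < max_order - 1) {
                unsigned long buddy_idx = page_idx ^ (1UL << order);
                unsigned long combined_idx = page_idx & ~(1UL << order);

                printf("order %u: block %lu, buddy %lu, merged block %lu\n",
                       order, page_idx, buddy_idx, combined_idx);

                /* in the kernel, merging only continues if page_is_buddy()
                 * says the buddy is free and of the same order */
                page_idx = combined_idx;
                order++;
        }
        printf("final block at index %lu, order %u\n", page_idx, order);
        return 0;
}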