We skip the e820 initialization for now and only look at the buddy-system and slab initialization paths. e820 is simply the mechanism that probes which physical memory is usable.
1. Initializing pglist_data
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
1.1 initmem_init (determine the memory managed by each node)
arch -> x86 -> mm -> numa.c
void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, int acpi, int k8)
{
    // Code before and after has been removed; we only care about the case
    // where the k8 scan succeeds
    if (!numa_off && k8 && !k8_scan_nodes())
        return;
}

// nodes_parsed is the set of node ids detected during early boot; it is
// essentially a bitmap. When a node id is detected, the bit at that index
// is set. For example: nodes_parsed starts out as 0000; if node ids 0, 1
// and 3 are detected, nodes_parsed becomes 1011 (bit 0 being the lowest bit).
int __init k8_scan_nodes(void)
{
    // ACPI-related logic removed for now; not clear yet what it does
    int i;

    node_possible_map = nodes_parsed;
    // The memnode_shift computation is removed for now; not clear what it is for

    // Walk node_possible_map, which at this point is simply nodes_parsed
    for_each_node_mask(i, node_possible_map) {
        // Compare the scanned node memory range against the real e820 map
        // and add the actually usable (non-hole) memory to early_node_map.
        // early_node_map is the mapping from nid to start/end page frame numbers.
        e820_register_active_regions(i,
                nodes[i].start >> PAGE_SHIFT,
                nodes[i].end >> PAGE_SHIFT);
        // Set up the bootmem allocator for each node
        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
    }
    numa_init_array();
    return 0;
}
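A minimal user-space sketch of how such a node bitmap is walked. The kernel uses nodemask_t and for_each_node_mask(); here a plain unsigned long stands in for the bitmap, so this is illustrative only:

#include <stdio.h>

#define MAX_NUMNODES 8

int main(void)
{
    unsigned long nodes_parsed = 0;

    /* "detect" node ids 0, 1 and 3: set the corresponding bits */
    nodes_parsed |= 1UL << 0;
    nodes_parsed |= 1UL << 1;
    nodes_parsed |= 1UL << 3;    /* bitmap is now 0b1011 */

    /* rough equivalent of for_each_node_mask(i, nodes_parsed) */
    for (int i = 0; i < MAX_NUMNODES; i++)
        if (nodes_parsed & (1UL << i))
            printf("node %d present\n", i);

    return 0;
}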
1.2 setup_node_bootmem (set up the node's bootmem)
arch -> x86 -> mm -> numa_64.c
// __pa converts a virtual address to a physical address
// __va converts a physical address to a virtual address
// reserve_early records memory that has already been allocated
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
    // start/end page frame numbers and the physical address of the node data
    unsigned long start_pfn, last_pfn, nodedata_phys;
    // pgdat_size rounded up to a multiple of the page size
    const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
    // node id
    int nid;
    unsigned long bootmap_start, bootmap_pages, bootmap_size;
    void *bootmap;

    if (!end)
        return;

    /*
     * Don't confuse VM with a node that doesn't have the
     * minimum amount of memory:
     */
    if (end && (end - start) < NODE_MIN_SIZE)
        return;

    // Round start up so that it is aligned to the largest-order page size
    start = roundup(start, ZONE_ALIGN);

    start_pfn = start >> PAGE_SHIFT;
    last_pfn = end >> PAGE_SHIFT;

    // Allocate memory for the pgdat; only the pointer is static,
    // the actual memory has to be allocated here
    node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
                                       SMP_CACHE_BYTES);
    if (node_data[nodeid] == NULL)
        return;
    // Physical address of the allocation
    nodedata_phys = __pa(node_data[nodeid]);
    reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");

    // Look up the node id from the physical address.
    // nid may differ from nodeid, because the allocation may land
    // on node id 0 first.
    nid = phys_to_nid(nodedata_phys);

    // Initialize the pgdat
    memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
    NODE_DATA(nodeid)->node_id = nodeid;
    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
    NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
    NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];

    // How many pages the bitmap for this many page frames needs
    bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
    // Place the bootmap right after the node data, aligned to PAGE_SIZE
    bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
    // Allocate memory for the bootmap
    bootmap = early_node_mem(nodeid, bootmap_start, end,
                             bootmap_pages << PAGE_SHIFT, PAGE_SIZE);
    if (bootmap == NULL) {
        free_early(nodedata_phys, nodedata_phys + pgdat_size);
        node_data[nodeid] = NULL;
        return;
    }
    bootmap_start = __pa(bootmap);
    reserve_early(bootmap_start, bootmap_start + (bootmap_pages << PAGE_SHIFT),
                  "BOOTMAP");

    // Initialize the bootmem allocator
    bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
                                     bootmap_start >> PAGE_SHIFT,
                                     start_pfn, last_pfn);

    // Again look up the node id from the physical address;
    // nid may differ from nodeid for the same reason as above
    nid = phys_to_nid(bootmap_start);

    free_bootmem_with_active_regions(nodeid, end);

    // Mark the node online
    node_set_online(nodeid);
}

// early_node_mem is the early memory allocator; it essentially carves
// allocations out of the e820 map.
// start/end: memory range managed by this node
// size: size to allocate
// align: alignment
static void * __init early_node_mem(int nodeid, unsigned long start,
                                    unsigned long end, unsigned long size,
                                    unsigned long align)
{
    unsigned long mem;

    if (start < (MAX_DMA_PFN << PAGE_SHIFT))
        start = MAX_DMA_PFN << PAGE_SHIFT;
    if (start < (MAX_DMA32_PFN << PAGE_SHIFT) &&
        end > (MAX_DMA32_PFN << PAGE_SHIFT))
        start = MAX_DMA32_PFN << PAGE_SHIFT;
    mem = find_e820_area(start, end, size, align);
    if (mem != -1L)
        return __va(mem);

    end = max_pfn_mapped << PAGE_SHIFT;
    if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
        start = MAX_DMA32_PFN << PAGE_SHIFT;
    else
        start = MAX_DMA_PFN << PAGE_SHIFT;
    mem = find_e820_area(start, end, size, align);
    if (mem != -1L)
        return __va(mem);

    return NULL;
}
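A rough arithmetic sketch of the layout setup_node_bootmem carves out at the start of a node: the pgdat first, then the page-aligned bootmem bitmap right after it. The node range and struct size below are made up, the pgdat is assumed to sit exactly at the node start, and the bitmap sizing only approximates what bootmem_bootmap_pages computes (one bit per pfn, rounded up to whole pages):

#include <stdio.h>

#define PAGE_SHIFT    12
#define PAGE_SIZE     (1UL << PAGE_SHIFT)
#define BITS_PER_BYTE 8

/* roundup as used in the kernel: round x up to a multiple of y */
static unsigned long roundup(unsigned long x, unsigned long y)
{
    return ((x + y - 1) / y) * y;
}

int main(void)
{
    unsigned long node_start  = 0x100000000UL;             /* hypothetical node start */
    unsigned long node_end    = 0x200000000UL;             /* hypothetical node end   */
    unsigned long pgdat_bytes = roundup(3456, PAGE_SIZE);  /* assumed sizeof(pg_data_t) */

    unsigned long pfns         = (node_end - node_start) >> PAGE_SHIFT;
    unsigned long bitmap_bytes = (pfns + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
    unsigned long bitmap_pages = roundup(bitmap_bytes, PAGE_SIZE) >> PAGE_SHIFT;

    /* bootmap starts right after the pgdat, page aligned */
    unsigned long bootmap_start = roundup(node_start + pgdat_bytes, PAGE_SIZE);

    printf("pgdat:   [%#lx, %#lx)\n", node_start, node_start + pgdat_bytes);
    printf("bootmap: [%#lx, %#lx) (%lu pages for %lu pfns)\n",
           bootmap_start, bootmap_start + (bitmap_pages << PAGE_SHIFT),
           bitmap_pages, pfns);
    return 0;
}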
1.3 init_bootmem_node (core function for bootmem initialization)
/**
 * init_bootmem_node - register a node as boot memory
 * @pgdat: node to register
 * @freepfn: pfn where the bitmap for this node is to be placed
 * @startpfn: first pfn on the node
 * @endpfn: first pfn after the node
 *
 * Returns the number of bytes needed to hold the bitmap for this node.
 */
unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
                                       unsigned long startpfn, unsigned long endpfn)
{
    return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
}

static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
                                              unsigned long mapstart, unsigned long start, unsigned long end)
{
    unsigned long mapsize;

    // This function does nothing in bootmem mode
    mminit_validate_memmodel_limits(&start, &end);
    // Fill in bdata; node_bootmem_map is the start address of the bitmap
    bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
    bdata->node_min_pfn = start;
    bdata->node_low_pfn = end;
    // Link bdata into bdata_list
    link_bootmem(bdata);

    mapsize = bootmap_bytes(end - start);
    memset(bdata->node_bootmem_map, 0xff, mapsize);

    return mapsize;
}
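The memset to 0xff reflects the bootmem convention that a set bit means "reserved"; pages are freed later by clearing bits. As a sketch of the sizing rule, assuming the usual one-bit-per-pfn layout rounded up to whole longs (this is meant to mirror what bootmap_bytes() computes):

/* Sketch: bitmap bytes needed to cover 'pages' page frames. */
static unsigned long bootmap_bytes_sketch(unsigned long pages)
{
    unsigned long bytes = (pages + 7) / 8;                    /* one bit per pfn   */
    return (bytes + sizeof(long) - 1) & ~(sizeof(long) - 1);  /* align to long     */
}
/* e.g. a node spanning 1,048,576 pfns (4 GiB of 4 KiB pages) needs 128 KiB of bitmap */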
1.4 free_bootmem_with_active_regions (release usable pages to bootmem)
void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
{
    int i;

    // Walk every range belonging to the given nid
    for_each_active_range_index_in_nid(i, nid) {
        unsigned long size_pages = 0;
        unsigned long end_pfn = early_node_map[i].end_pfn;

        if (early_node_map[i].start_pfn >= max_low_pfn)
            continue;

        if (end_pfn > max_low_pfn)
            end_pfn = max_low_pfn;

        size_pages = end_pfn - early_node_map[i].start_pfn;
        // Free the pages at the given start address and size
        free_bootmem_node(NODE_DATA(early_node_map[i].nid),
                          PFN_PHYS(early_node_map[i].start_pfn),
                          size_pages << PAGE_SHIFT);
    }
}

void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
                              unsigned long size)
{
    unsigned long start, end;

    kmemleak_free_part(__va(physaddr), size);

    start = PFN_UP(physaddr);
    end = PFN_DOWN(physaddr + size);

    mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
}
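With reserve=0, mark_bootmem_node boils down to clearing the corresponding bits in the node's bitmap. A minimal sketch of just that clearing step (the real function also sanity-checks the range against node_min_pfn/node_low_pfn and uses the kernel bitmap helpers):

/* Sketch: clear bits [start, end) relative to the node's first pfn,
 * marking those page frames as free for the bootmem allocator. */
static void mark_free_sketch(unsigned long *bitmap, unsigned long node_min_pfn,
                             unsigned long start, unsigned long end)
{
    for (unsigned long pfn = start; pfn < end; pfn++) {
        unsigned long idx = pfn - node_min_pfn;
        bitmap[idx / (8 * sizeof(long))] &= ~(1UL << (idx % (8 * sizeof(long))));
    }
}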
2. Initializing pg_data_t and the zone data structures
arch -> x86 -> mm -> init_64.c
void __init paging_init(void)
{
    unsigned long max_zone_pfns[MAX_NR_ZONES];

    memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
    max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
    max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
    max_zone_pfns[ZONE_NORMAL] = max_pfn;

    // ?????? — to be figured out later
    sparse_memory_present_with_active_regions(MAX_NUMNODES);
    sparse_init();
    // Does nothing at this point, skip
    node_clear_state(0, N_NORMAL_MEMORY);

    free_area_init_nodes(max_zone_pfns);
}

// max_zone_pfn is an array holding the maximum page frame number of each zone.
// {10, 20, 40} means:
//   ZONE_DMA    ends at pfn 10
//   ZONE_DMA32  ends at pfn 20
//   ZONE_NORMAL ends at pfn 40
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
    unsigned long nid;
    int i;

    // Sort early_node_map by start_pfn
    sort_node_map();

    // Record the boundaries between zones
    memset(arch_zone_lowest_possible_pfn, 0,
           sizeof(arch_zone_lowest_possible_pfn));
    memset(arch_zone_highest_possible_pfn, 0,
           sizeof(arch_zone_highest_possible_pfn));
    // find_min_pfn_with_active_regions walks every early_node_map entry
    // and returns the smallest start_pfn
    arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
    // max_zone_pfn[0] is MAX_DMA_PFN
    arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
    for (i = 1; i < MAX_NR_ZONES; i++) {
        if (i == ZONE_MOVABLE)
            continue;
        arch_zone_lowest_possible_pfn[i] =
            arch_zone_highest_possible_pfn[i-1];
        arch_zone_highest_possible_pfn[i] =
            max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
    }
    arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
    arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
    // After the loop above, the two arrays could look like this:
    //   arch_zone_lowest_possible_pfn  {1, 4, 6}
    //   arch_zone_highest_possible_pfn {4, 6, 8}
    // i.e. DMA covers pfns {1, 4}, DMA32 {4, 6}, NORMAL {6, 8}

    // Compute the starting pfn of the movable region on each node
    memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
    find_zone_movable_pfns_for_nodes(zone_movable_pfn);

    // Memory printout removed
    // Debug only, skip
    mminit_verify_pageflags_layout();
    // Set the maximum node id
    setup_nr_node_ids();
    // Walk every online node and initialize its zones
    for_each_online_node(nid) {
        pg_data_t *pgdat = NODE_DATA(nid);
        free_area_init_node(nid, NULL, find_min_pfn_for_node(nid), NULL);

        if (pgdat->node_present_pages)
            node_set_state(nid, N_HIGH_MEMORY);
        check_for_regular_memory(pgdat);
    }
}

void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
                                      unsigned long node_start_pfn, unsigned long *zholes_size)
{
    pg_data_t *pgdat = NODE_DATA(nid);

    pgdat->node_id = nid;
    pgdat->node_start_pfn = node_start_pfn;
    // Count all pages, both including and excluding holes,
    // filling pgdat->node_spanned_pages and pgdat->node_present_pages
    calculate_node_totalpages(pgdat, zones_size, zholes_size);

    // Allocate node_mem_map with the bootmem allocator;
    // it is essentially the struct page array for the whole node
    alloc_node_mem_map(pgdat);

    // Fill in the zones
    free_area_init_core(pgdat, zones_size, zholes_size);
}
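A small standalone sketch of just the boundary arithmetic above, reusing the toy max_zone_pfn values {4, 6, 8} from the comment and ignoring ZONE_MOVABLE:

#include <stdio.h>

#define NR_ZONES 3  /* DMA, DMA32, NORMAL in this sketch */

int main(void)
{
    unsigned long max_zone_pfn[NR_ZONES] = {4, 6, 8};  /* toy values */
    unsigned long min_pfn = 1;                         /* lowest active pfn */
    unsigned long lo[NR_ZONES], hi[NR_ZONES];

    lo[0] = min_pfn;
    hi[0] = max_zone_pfn[0];
    for (int i = 1; i < NR_ZONES; i++) {
        lo[i] = hi[i - 1];                             /* zones are stacked back to back */
        hi[i] = max_zone_pfn[i] > lo[i] ? max_zone_pfn[i] : lo[i];
    }

    for (int i = 0; i < NR_ZONES; i++)
        printf("zone %d: pfn [%lu, %lu)\n", i, lo[i], hi[i]);
    /* prints: zone 0: [1,4), zone 1: [4,6), zone 2: [6,8) */
    return 0;
}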
2.1 free_area_init_core (core logic)
// As far as we can tell, zones_size and zholes_size are both NULL here
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                                             unsigned long *zones_size, unsigned long *zholes_size)
{
    enum zone_type j;
    int nid = pgdat->node_id;
    unsigned long zone_start_pfn = pgdat->node_start_pfn;
    int ret;

    // Initialize node_size_lock
    pgdat_resize_init(pgdat);
    pgdat->nr_zones = 0;
    init_waitqueue_head(&pgdat->kswapd_wait);
    pgdat->kswapd_max_order = 0;
    // Ignore cgroups for now
    pgdat_page_cgroup_init(pgdat);

    for (j = 0; j < MAX_NR_ZONES; j++) {
        struct zone *zone = pgdat->node_zones + j;
        unsigned long size, realsize, memmap_pages;
        enum lru_list l;

        // Compute the total pages and the pages excluding holes for this zone;
        // size is a count of page frames
        size = zone_spanned_pages_in_node(nid, j, zones_size);
        realsize = size - zone_absent_pages_in_node(nid, j, zholes_size);

        // Number of pages needed for the struct page array of this zone
        memmap_pages = PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
        // The small-memory handling here has been removed; we don't consider it

        // Subtract the DMA reserve
        if (j == 0 && realsize > dma_reserve) {
            realsize -= dma_reserve;
        }

        // Add realsize to nr_kernel_pages / nr_all_pages
        if (!is_highmem_idx(j))
            // Without highmem, these pages are usable by the kernel
            nr_kernel_pages += realsize;
        nr_all_pages += realsize;

        zone->spanned_pages = size;
        zone->present_pages = realsize;

        zone->node = nid;
        // Page reclaim only kicks in once there are more reclaimable pages than this
        zone->min_unmapped_pages = (realsize * sysctl_min_unmapped_ratio) / 100;
        // Slab reclaim only kicks in once there are more reclaimable slab pages than this
        zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
        zone->name = zone_names[j];
        spin_lock_init(&zone->lock);
        spin_lock_init(&zone->lru_lock);
        // Memory hotplug related
        zone_seqlock_init(zone);
        zone->zone_pgdat = pgdat;

        zone->prev_priority = DEF_PRIORITY;

        // ??????
        zone_pcp_init(zone);
        // Initialize a pile of fields
        for_each_lru(l) {
            INIT_LIST_HEAD(&zone->lru[l].list);
            zone->reclaim_stat.nr_saved_scan[l] = 0;
        }
        zone->reclaim_stat.recent_rotated[0] = 0;
        zone->reclaim_stat.recent_rotated[1] = 0;
        zone->reclaim_stat.recent_scanned[0] = 0;
        zone->reclaim_stat.recent_scanned[1] = 0;
        zap_zone_vm_stats(zone);
        zone->flags = 0;
        if (!size)
            continue;

        // ??????
        set_pageblock_order(pageblock_default_order());
        setup_usemap(pgdat, zone, size);
        // Initialize the starting pfn and the free_list lists
        ret = init_currently_empty_zone(zone, zone_start_pfn,
                                        size, MEMMAP_EARLY);
        // Initialize the struct pages
        memmap_init(size, nid, j, zone_start_pfn);
        zone_start_pfn += size;
    }
}
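To make the per-zone bookkeeping concrete, a toy calculation of memmap_pages and the two reclaim thresholds. The zone size is made up, sizeof(struct page) is assumed to be 64 bytes, and the two ratios use the usual sysctl defaults:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
    unsigned long size = 262144;          /* toy zone size: 1 GiB of 4 KiB pages */
    unsigned long realsize = size;        /* assume no holes */
    unsigned long page_struct_size = 64;  /* assumed sizeof(struct page) */
    unsigned long min_unmapped_ratio = 1; /* default sysctl values */
    unsigned long min_slab_ratio = 5;

    unsigned long memmap_bytes = size * page_struct_size;
    unsigned long memmap_pages =
        ((memmap_bytes + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) >> PAGE_SHIFT;

    printf("memmap needs %lu pages\n", memmap_pages);                      /* 4096  */
    printf("min_unmapped_pages = %lu\n", realsize * min_unmapped_ratio / 100); /* 2621  */
    printf("min_slab_pages     = %lu\n", realsize * min_slab_ratio / 100);     /* 13107 */
    return 0;
}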
2.2 Helper functions
2.2.1 memmap_init_zone (initialize all pages as reserved)
/*
 * Initially all pages are reserved - free ones are freed
 * up by free_all_bootmem() once the early boot process is
 * done. Non-atomic initialization, single-pass.
 */
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                                unsigned long start_pfn, enum memmap_context context)
{
    struct page *page;
    unsigned long end_pfn = start_pfn + size;
    unsigned long pfn;
    struct zone *z;

    if (highest_memmap_pfn < end_pfn - 1)
        highest_memmap_pfn = end_pfn - 1;

    z = &NODE_DATA(nid)->node_zones[zone];
    for (pfn = start_pfn; pfn < end_pfn; pfn++) {
        /*
         * There can be holes in boot-time mem_map[]s
         * handed to this function. They do not
         * exist on hotplugged memory.
         */
        if (context == MEMMAP_EARLY) {
            if (!early_pfn_valid(pfn))
                continue;
            if (!early_pfn_in_nid(pfn, nid))
                continue;
        }
        page = pfn_to_page(pfn);
        // Set the page flags: zone/node/section
        set_page_links(page, zone, nid, pfn);
        // Some sanity checks
        mminit_verify_page_links(page, zone, nid, pfn);
        // Set page->_count to 1
        init_page_count(page);
        // Reset page->_mapcount (to -1, i.e. not mapped)
        reset_page_mapcount(page);
        SetPageReserved(page);
        /*
         * Mark the block movable so that blocks are reserved for
         * movable at startup. This will force kernel allocations
         * to reserve their blocks rather than leaking throughout
         * the address space during boot when many long-lived
         * kernel allocations are made. Later some blocks near
         * the start are marked MIGRATE_RESERVE by
         * setup_zone_migrate_reserve()
         *
         * bitmap is created for zone's valid pfn range. but memmap
         * can be created for invalid pages (for alignment)
         * check here not to call set_pageblock_migratetype() against
         * pfn out of zone.
         */
        if ((z->zone_start_pfn <= pfn)
            && (pfn < z->zone_start_pfn + z->spanned_pages)
            && !(pfn & (pageblock_nr_pages - 1)))
            set_pageblock_migratetype(page, MIGRATE_MOVABLE);

        // Initialize the lru list head
        INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
        /* The shift won't overflow because ZONE_NORMAL is below 4G. */
        if (!is_highmem_idx(zone))
            set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
    }
}
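The migratetype is only set once per pageblock; the test !(pfn & (pageblock_nr_pages - 1)) is just a power-of-two modulo that picks the first pfn of each pageblock. A tiny sketch, assuming 2 MiB pageblocks of 4 KiB pages (pageblock order 9, so 512 pages per block):

#include <stdio.h>

int main(void)
{
    unsigned long pageblock_nr_pages = 512;  /* assumed: 2 MiB blocks of 4 KiB pages */

    for (unsigned long pfn = 1020; pfn < 1030; pfn++)
        if (!(pfn & (pageblock_nr_pages - 1)))
            /* only pfn 1024 hits: it is the first page of its pageblock */
            printf("pfn %lu starts a pageblock\n", pfn);
    return 0;
}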
2.2.2 find_zone_movable_pfns_for_nodes (still hard to follow)
/*
 * Find the PFN the Movable zone begins in each node. Kernel memory
 * is spread evenly between nodes as long as the nodes have enough
 * memory. When they don't, some nodes will have more kernelcore than
 * others
 */
static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
{
    int i, nid;
    unsigned long usable_startpfn;
    unsigned long kernelcore_node, kernelcore_remaining;
    /* save the state before borrow the nodemask */
    nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
    unsigned long totalpages = early_calculate_totalpages();
    int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);

    /*
     * If movablecore was specified, calculate what size of
     * kernelcore that corresponds so that memory usable for
     * any allocation type is evenly spread. If both kernelcore
     * and movablecore are specified, then the value of kernelcore
     * will be used for required_kernelcore if it's greater than
     * what movablecore would have allowed.
     */
    if (required_movablecore) {
        unsigned long corepages;

        /*
         * Round-up so that ZONE_MOVABLE is at least as large as what
         * was requested by the user
         */
        required_movablecore =
            roundup(required_movablecore, MAX_ORDER_NR_PAGES);
        corepages = totalpages - required_movablecore;

        required_kernelcore = max(required_kernelcore, corepages);
    }

    /* If kernelcore was not specified, there is no ZONE_MOVABLE */
    if (!required_kernelcore)
        goto out;

    /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
    find_usable_zone_for_movable();
    usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];

restart:
    /* Spread kernelcore memory as evenly as possible throughout nodes */
    kernelcore_node = required_kernelcore / usable_nodes;
    for_each_node_state(nid, N_HIGH_MEMORY) {
        /*
         * Recalculate kernelcore_node if the division per node
         * now exceeds what is necessary to satisfy the requested
         * amount of memory for the kernel
         */
        if (required_kernelcore < kernelcore_node)
            kernelcore_node = required_kernelcore / usable_nodes;

        /*
         * As the map is walked, we track how much memory is usable
         * by the kernel using kernelcore_remaining. When it is
         * 0, the rest of the node is usable by ZONE_MOVABLE
         */
        kernelcore_remaining = kernelcore_node;

        /* Go through each range of PFNs within this node */
        for_each_active_range_index_in_nid(i, nid) {
            unsigned long start_pfn, end_pfn;
            unsigned long size_pages;

            start_pfn = max(early_node_map[i].start_pfn,
                            zone_movable_pfn[nid]);
            end_pfn = early_node_map[i].end_pfn;
            if (start_pfn >= end_pfn)
                continue;

            /* Account for what is only usable for kernelcore */
            if (start_pfn < usable_startpfn) {
                unsigned long kernel_pages;
                kernel_pages = min(end_pfn, usable_startpfn)
                               - start_pfn;

                kernelcore_remaining -= min(kernel_pages,
                                            kernelcore_remaining);
                required_kernelcore -= min(kernel_pages,
                                           required_kernelcore);

                /* Continue if range is now fully accounted */
                if (end_pfn <= usable_startpfn) {
                    /*
                     * Push zone_movable_pfn to the end so
                     * that if we have to rebalance
                     * kernelcore across nodes, we will
                     * not double account here
                     */
                    zone_movable_pfn[nid] = end_pfn;
                    continue;
                }
                start_pfn = usable_startpfn;
            }

            /*
             * The usable PFN range for ZONE_MOVABLE is from
             * start_pfn->end_pfn. Calculate size_pages as the
             * number of pages used as kernelcore
             */
            size_pages = end_pfn - start_pfn;
            if (size_pages > kernelcore_remaining)
                size_pages = kernelcore_remaining;
            zone_movable_pfn[nid] = start_pfn + size_pages;

            /*
             * Some kernelcore has been met, update counts and
             * break if the kernelcore for this node has been
             * satisified
             */
            required_kernelcore -= min(required_kernelcore,
                                       size_pages);
            kernelcore_remaining -= size_pages;
            if (!kernelcore_remaining)
                break;
        }
    }

    /*
     * If there is still required_kernelcore, we do another pass with one
     * less node in the count. This will push zone_movable_pfn[nid] further
     * along on the nodes that still have memory until kernelcore is
     * satisified
     */
    usable_nodes--;
    if (usable_nodes && required_kernelcore > usable_nodes)
        goto restart;

    /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
    for (nid = 0; nid < MAX_NUMNODES; nid++)
        zone_movable_pfn[nid] =
            roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);

out:
    /* restore the node_state */
    node_states[N_HIGH_MEMORY] = saved_node_state;
}
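Since the loop above is hard to follow, here is a deliberately simplified sketch of just the even-spread idea: kernelcore is divided equally across the usable nodes and ZONE_MOVABLE starts right after each node's share. Holes, the usable_startpfn cutoff and the rebalancing restart are all ignored, and every number is made up:

#include <stdio.h>

int main(void)
{
    /* Toy setup: two nodes of 8 GiB each (2,097,152 pfns of 4 KiB),
     * kernelcore=4 GiB (1,048,576 pfns) requested on the command line. */
    unsigned long node_start[2] = {0, 2097152};
    unsigned long required_kernelcore = 1048576;
    int usable_nodes = 2;

    /* Spread kernelcore evenly: each node keeps this many kernel pages ... */
    unsigned long kernelcore_node = required_kernelcore / usable_nodes;

    for (int nid = 0; nid < usable_nodes; nid++) {
        /* ... so ZONE_MOVABLE starts right after them on every node. */
        unsigned long zone_movable_pfn = node_start[nid] + kernelcore_node;
        printf("node %d: ZONE_MOVABLE starts at pfn %lu\n", nid, zone_movable_pfn);
    }
    return 0;
}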
3. Building the zonelists
void build_all_zonelists(void)
{
    // By default we can assume ordering by ZONELIST_ORDER_NODE
    set_zonelist_order();

    if (system_state == SYSTEM_BOOTING) {
        __build_all_zonelists(NULL);
        mminit_verify_zonelist();
        cpuset_init_current_mems_allowed();
    } else {
        /* we have to stop all cpus to guarantee there is no user
           of zonelist */
        stop_machine(__build_all_zonelists, NULL, NULL);
        /* cpuset refresh routine should be here */
    }
    vm_total_pages = nr_free_pagecache_pages();
    /*
     * Disable grouping by mobility if the number of pages in the
     * system is too low to allow the mechanism to work. It would be
     * more accurate, but expensive to check per-zone. This check is
     * made on memory-hotadd so a system can start with mobility
     * disabled and enable it later
     */
    if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
        page_group_by_mobility_disabled = 1;
    else
        page_group_by_mobility_disabled = 0;
}

static int __build_all_zonelists(void *dummy)
{
    int nid;
    int cpu;

    memset(node_load, 0, sizeof(node_load));
    for_each_online_node(nid) {
        pg_data_t *pgdat = NODE_DATA(nid);

        build_zonelists(pgdat);
        build_zonelist_cache(pgdat);
    }

    /*
     * Initialize the boot_pagesets that are going to be used
     * for bootstrapping processors. The real pagesets for
     * each zone will be allocated later when the per cpu
     * allocator is available.
     *
     * boot_pagesets are used also for bootstrapping offline
     * cpus if the system is already booted because the pagesets
     * are needed to initialize allocators on a specific cpu too.
     * F.e. the percpu allocator needs the page allocator which
     * needs the percpu allocator in order to allocate its pagesets
     * (a chicken-egg dilemma).
     */
    for_each_possible_cpu(cpu)
        setup_pageset(&per_cpu(boot_pageset, cpu), 0);

    return 0;
}
3.1 Helper functions
3.1.1 build_zonelists
static int node_order[MAX_NUMNODES];
static void build_zonelists(pg_data_t *pgdat)
{
    int j, node, load;
    enum zone_type i;
    nodemask_t used_mask;
    int local_node, prev_node;
    struct zonelist *zonelist;
    int order = current_zonelist_order;

    for (i = 0; i < MAX_ZONELISTS; i++) {
        zonelist = pgdat->node_zonelists + i;
        zonelist->_zonerefs[0].zone = NULL;
        zonelist->_zonerefs[0].zone_idx = 0;
    }

    local_node = pgdat->node_id;
    load = nr_online_nodes;
    prev_node = local_node;
    nodes_clear(used_mask);

    memset(node_order, 0, sizeof(node_order));
    j = 0;

    // Repeatedly pick the node closest to local_node
    while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
        int distance = node_distance(local_node, node);

        /*
         * If another node is sufficiently far away then it is better
         * to reclaim pages in a zone before going off node.
         */
        if (distance > RECLAIM_DISTANCE)
            zone_reclaim_mode = 1;

        /*
         * We don't want to pressure a particular node.
         * So adding penalty to the first node in same
         * distance group to make it round-robin.
         */
        if (distance != node_distance(local_node, prev_node))
            node_load[node] = load;

        prev_node = node;
        load--;
        if (order == ZONELIST_ORDER_NODE)
            build_zonelists_in_node_order(pgdat, node);
        else
            node_order[j++] = node; /* remember order */
    }

    if (order == ZONELIST_ORDER_ZONE) {
        /* calculate node order -- i.e., DMA last! */
        build_zonelists_in_zone_order(pgdat, j);
    }

    build_thisnode_zonelists(pgdat);
}
/*
 * Build zonelists ordered by node and zones within node.
 * This results in maximum locality--normal zone overflows into local
 * DMA zone, if any--but risks exhausting DMA zone.
 */
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
{
    int j;
    struct zonelist *zonelist;

    zonelist = &pgdat->node_zonelists[0];
    for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
        ;
    j = build_zonelists_node(NODE_DATA(node), zonelist, j,
                             MAX_NR_ZONES - 1);
    zonelist->_zonerefs[j].zone = NULL;
    zonelist->_zonerefs[j].zone_idx = 0;
}
/*
 * Build gfp_thisnode zonelists
 */
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
    int j;
    struct zonelist *zonelist;

    zonelist = &pgdat->node_zonelists[1];
    j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
    zonelist->_zonerefs[j].zone = NULL;
    zonelist->_zonerefs[j].zone_idx = 0;
}
/**
 * find_next_best_node - find the next node that should appear in a given node's fallback list
 * @node: node whose fallback list we're appending
 * @used_node_mask: nodemask_t of already used nodes
 *
 * We use a number of factors to determine which is the next node that should
 * appear on a given node's fallback list. The node should not have appeared
 * already in @node's fallback list, and it should be the next closest node
 * according to the distance array (which contains arbitrary distance values
 * from each node to each node in the system), and should also prefer nodes
 * with no CPUs, since presumably they'll have very little allocation pressure
 * on them otherwise.
 * It returns -1 if no node is found.
 */
static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
    int n, val;
    int min_val = INT_MAX;
    int best_node = -1;
    const struct cpumask *tmp = cpumask_of_node(0);

    // The local node itself is returned first
    if (!node_isset(node, *used_node_mask)) {
        node_set(node, *used_node_mask);
        return node;
    }

    for_each_node_state(n, N_HIGH_MEMORY) {
        // Skip nodes that have already been used
        if (node_isset(n, *used_node_mask))
            continue;

        /* Use the distance array to find the distance */
        val = node_distance(node, n);

        /* Penalize nodes under us ("prefer the next node") */
        val += (n < node);

        /* Give preference to headless and unused nodes */
        tmp = cpumask_of_node(n);
        if (!cpumask_empty(tmp))
            val += PENALTY_FOR_NODE_WITH_CPUS;

        /* Slight preference for less loaded node */
        val *= (MAX_NODE_LOAD * MAX_NUMNODES);
        val += node_load[n];

        if (val < min_val) {
            min_val = val;
            best_node = n;
        }
    }

    if (best_node >= 0)
        node_set(best_node, *used_node_mask);

    return best_node;
}
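A toy scoring run of the selection rule above, for node 0 choosing between candidates 1 and 2. The distance matrix, CPU layout and load values are made up; only the weighting formula mirrors the code (and MAX_NODE_LOAD is assumed to be nr_online_nodes, here 3):

#include <stdio.h>
#include <limits.h>

#define PENALTY_FOR_NODE_WITH_CPUS 1
#define MAX_NODE_LOAD 3   /* assumed: nr_online_nodes */
#define MAX_NUMNODES  3

int main(void)
{
    int node = 0;
    int distance[3]  = {10, 20, 20};  /* made-up node_distance(0, n) */
    int has_cpus[3]  = {1, 1, 0};     /* node 2 is CPU-less */
    int node_load[3] = {0, 2, 0};     /* node 1 was penalized earlier */
    int best = -1, min_val = INT_MAX;

    for (int n = 1; n < 3; n++) {
        int val = distance[n];
        val += (n < node);                      /* prefer higher-numbered nodes */
        if (has_cpus[n])
            val += PENALTY_FOR_NODE_WITH_CPUS;  /* prefer headless nodes */
        val *= MAX_NODE_LOAD * MAX_NUMNODES;    /* distance dominates ...       */
        val += node_load[n];                    /* ... load only breaks ties    */
        printf("node %d scores %d\n", n, val);
        if (val < min_val) { min_val = val; best = n; }
    }
    printf("best node: %d\n", best);  /* node 2 wins: same distance, no CPUs, no load */
    return 0;
}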
4. Initializing the buddy system
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
    int aligned;
    struct page *page;
    unsigned long start, end, pages, count = 0;

    if (!bdata->node_bootmem_map)
        return 0;

    start = bdata->node_min_pfn;
    end = bdata->node_low_pfn;

    aligned = !(start & (BITS_PER_LONG - 1));

    while (start < end) {
        unsigned long *map, idx, vec;

        map = bdata->node_bootmem_map;
        idx = start - bdata->node_min_pfn;
        vec = ~map[idx / BITS_PER_LONG];

        if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
            int order = ilog2(BITS_PER_LONG);

            __free_pages_bootmem(pfn_to_page(start), order);
            count += BITS_PER_LONG;
        } else {
            unsigned long off = 0;

            while (vec && off < BITS_PER_LONG) {
                if (vec & 1) {
                    page = pfn_to_page(start + off);
                    __free_pages_bootmem(page, 0);
                    count++;
                }
                vec >>= 1;
                off++;
            }
        }
        start += BITS_PER_LONG;
    }

    page = virt_to_page(bdata->node_bootmem_map);
    pages = bdata->node_low_pfn - bdata->node_min_pfn;
    pages = bootmem_bootmap_pages(pages);
    count += pages;
    while (pages--)
        __free_pages_bootmem(page++, 0);

    return count;
}

void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
{
    if (order == 0) {
        __ClearPageReserved(page);
        set_page_count(page, 0);
        set_page_refcounted(page);
        __free_page(page);
    } else {
        int loop;

        prefetchw(page);
        for (loop = 0; loop < BITS_PER_LONG; loop++) {
            struct page *p = &page[loop];

            if (loop + 1 < BITS_PER_LONG)
                prefetchw(p + 1);
            __ClearPageReserved(p);
            set_page_count(p, 0);
        }

        set_page_refcounted(page);
        __free_pages(page, order);
    }
}

void __free_pages(struct page *page, unsigned int order)
{
    if (put_page_testzero(page)) {
        if (order == 0)
            free_hot_cold_page(page, 0);
        else
            __free_pages_ok(page, order);
    }
}

static void __free_pages_ok(struct page *page, unsigned int order)
{
    unsigned long flags;
    int i;
    int bad = 0;
    int wasMlocked = __TestClearPageMlocked(page);

    trace_mm_page_free_direct(page, order);
    kmemcheck_free_shadow(page, order);

    for (i = 0; i < (1 << order); ++i)
        bad += free_pages_check(page + i);
    if (bad)
        return;

    if (!PageHighMem(page)) {
        debug_check_no_locks_freed(page_address(page),
                                   PAGE_SIZE << order);
        debug_check_no_obj_freed(page_address(page),
                                 PAGE_SIZE << order);
    }
    arch_free_page(page, order);
    kernel_map_pages(page, 1 << order, 0);

    local_irq_save(flags);
    if (unlikely(wasMlocked))
        free_page_mlock(page);
    __count_vm_events(PGFREE, 1 << order);
    free_one_page(page_zone(page), page, order,
                  get_pageblock_migratetype(page));
    local_irq_restore(flags);
}
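A user-space sketch of the inner bitmap walk above: inverting a word of the bootmem bitmap turns "reserved" bits into "free" bits, so an all-ones vec means the whole BITS_PER_LONG-aligned chunk can be handed to the buddy allocator in one higher-order call, otherwise pages are freed one by one. printf stands in for __free_pages_bootmem, and the bitmap contents are made up:

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

int main(void)
{
    /* Toy bootmem bitmap: word 0 fully reserved except pfns 3 and 5,
     * word 1 completely free (all bits clear). */
    unsigned long map[2] = {~((1UL << 3) | (1UL << 5)), 0UL};

    for (unsigned long w = 0; w < 2; w++) {
        unsigned long vec = ~map[w];          /* a set bit now means "free" */
        unsigned long base_pfn = w * BITS_PER_LONG;

        if (vec == ~0UL) {
            printf("free pfns %lu..%lu as one high-order block\n",
                   base_pfn, base_pfn + BITS_PER_LONG - 1);
        } else {
            for (unsigned long off = 0; vec && off < BITS_PER_LONG; off++, vec >>= 1)
                if (vec & 1)
                    printf("free single pfn %lu\n", base_pfn + off);
        }
    }
    return 0;
}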
4.1 free_one_page (core function)
static void free_one_page(struct zone *zone, struct page *page, int order,
                          int migratetype)
{
    spin_lock(&zone->lock);
    zone->all_unreclaimable = 0;
    zone->pages_scanned = 0;

    __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
    __free_one_page(page, zone, order, migratetype);
    spin_unlock(&zone->lock);
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length of (1 << order) and marked with PG_buddy. Page's
 * order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- wli
 */
static inline void __free_one_page(struct page *page,
                                   struct zone *zone, unsigned int order,
                                   int migratetype)
{
    unsigned long page_idx;

    if (unlikely(PageCompound(page)))
        if (unlikely(destroy_compound_page(page, order)))
            return;

    VM_BUG_ON(migratetype == -1);

    page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

    VM_BUG_ON(page_idx & ((1 << order) - 1));
    VM_BUG_ON(bad_range(zone, page));

    while (order < MAX_ORDER-1) {
        unsigned long combined_idx;
        struct page *buddy;

        buddy = __page_find_buddy(page, page_idx, order);
        if (!page_is_buddy(page, buddy, order))
            break;

        /* Our buddy is free, merge with it and move up one order. */
        list_del(&buddy->lru);
        zone->free_area[order].nr_free--;
        rmv_page_order(buddy);
        combined_idx = __find_combined_index(page_idx, order);
        page = page + (combined_idx - page_idx);
        page_idx = combined_idx;
        order++;
    }
    set_page_order(page, order);
    list_add(&page->lru,
             &zone->free_area[order].free_list[migratetype]);
    zone->free_area[order].nr_free++;
}
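The buddy lookup itself is pure index arithmetic: at order n, the buddy of a block differs from it only in bit n of its index, and the merged block starts at the index with that bit cleared. A small sketch of that arithmetic (the starting index is made up; it mirrors what __page_find_buddy and __find_combined_index compute, without the struct page bookkeeping):

#include <stdio.h>

int main(void)
{
    unsigned long page_idx = 12;   /* made-up block index within the zone's MAX_ORDER area */

    for (unsigned int order = 0; order < 4; order++) {
        unsigned long buddy_idx    = page_idx ^ (1UL << order);  /* flip bit 'order'  */
        unsigned long combined_idx = page_idx & ~(1UL << order); /* clear it to merge */

        printf("order %u: block %lu, buddy %lu, merged block starts at %lu\n",
               order, page_idx, buddy_idx, combined_idx);
        page_idx = combined_idx;   /* pretend the merge succeeded and go up one order */
    }
    return 0;
}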
