The e820 side of initialization is skipped for now; these notes only cover the buddy-system and slab initialization. e820 is simply the mechanism for probing which physical memory is usable.

1. Initializing pglist_data

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;

1.1 initmem_init (set up the memory range managed by each node)

arch -> x86 -> mm -> numa_64.c

    void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
                             int acpi, int k8)
    {
        // code before and after trimmed; we only care about the case where the k8 scan succeeds
        if (!numa_off && k8 && !k8_scan_nodes())
            return;
    }

    // nodes_parsed holds the node ids detected during early probing; it is essentially a bitmap.
    // When a node id is detected, the bit at that node id's index is set to 1.
    // For example: nodes_parsed starts out as 0000; if node ids 0/1/3 are detected,
    // nodes_parsed becomes 1011.
    int __init k8_scan_nodes(void)
    {
        // ACPI-related logic trimmed; not clear yet what it does
        int i;

        node_possible_map = nodes_parsed;
        // memnode_shift computation trimmed; its purpose is still unclear
        // iterate over node_possible_map, which here is simply nodes_parsed
        for_each_node_mask(i, node_possible_map) {
            // Compare the scanned node memory ranges against the real e820 map and
            // add the actually usable (non-hole) memory to early_node_map.
            // early_node_map maps a nid to its start/end page frame numbers.
            e820_register_active_regions(i,
                                         nodes[i].start >> PAGE_SHIFT,
                                         nodes[i].end >> PAGE_SHIFT);
            // set up each node's bootmem
            setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        }
        numa_init_array();
        return 0;
    }
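
To make the nodes_parsed example above concrete, here is a tiny user-space sketch (plain bit operations, not the kernel's nodemask_t/node_set() API) that sets one bit per detected node id; with ids 0/1/3 the resulting mask is 0b1011.

    /* User-space sketch of the nodes_parsed idea: one bit per detected node id.
     * The kernel uses nodemask_t / node_set(); this only illustrates the bitmap. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long nodes_parsed = 0;
        int detected[] = { 0, 1, 3 };              /* node ids reported by the scan */

        for (int i = 0; i < 3; i++)
            nodes_parsed |= 1UL << detected[i];    /* like node_set(id, nodes_parsed) */

        printf("nodes_parsed = 0x%lx\n", nodes_parsed);  /* prints 0xb == 0b1011 */
        return 0;
    }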

1.2 setup_node_bootmem (set up a node's bootmem)

arch -> x86 -> mm -> numa_64.c

    // __pa converts a virtual address to a physical address
    // __va converts a physical address to a virtual address
    // reserve_early records memory that has already been handed out
    void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
    {
        // start pfn / last pfn / physical address of the node data
        unsigned long start_pfn, last_pfn, nodedata_phys;
        // pgdat_size rounded up to a whole number of pages
        const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
        // node id
        int nid;
        unsigned long bootmap_start, bootmap_pages, bootmap_size;
        void *bootmap;

        if (!end)
            return;
        /*
         * Don't confuse VM with a node that doesn't have the
         * minimum amount of memory:
         */
        if (end && (end - start) < NODE_MIN_SIZE)
            return;
        // round start up to the maximum-order page size (ZONE_ALIGN)
        start = roundup(start, ZONE_ALIGN);
        start_pfn = start >> PAGE_SHIFT;
        last_pfn = end >> PAGE_SHIFT;
        // Allocate memory for pgdat: only the pointer array is statically allocated,
        // the actual pg_data_t has to be allocated here.
        node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
                                           SMP_CACHE_BYTES);
        if (node_data[nodeid] == NULL)
            return;
        // physical address of the allocation
        nodedata_phys = __pa(node_data[nodeid]);
        reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
        // Look up the node id from the physical address.
        // nid may differ from nodeid, because the allocation may have
        // ended up on another node (e.g. node 0).
        nid = phys_to_nid(nodedata_phys);
        // initialize pgdat
        memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
        NODE_DATA(nodeid)->node_id = nodeid;
        NODE_DATA(nodeid)->node_start_pfn = start_pfn;
        NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
        NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
        // compute how large a bitmap the given number of page frames needs
        bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
        // place the bootmap start right after the node data, rounded up to PAGE_SIZE
        bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
        // allocate memory for the bootmap
        bootmap = early_node_mem(nodeid, bootmap_start, end,
                                 bootmap_pages << PAGE_SHIFT, PAGE_SIZE);
        if (bootmap == NULL) {
            free_early(nodedata_phys, nodedata_phys + pgdat_size);
            node_data[nodeid] = NULL;
            return;
        }
        bootmap_start = __pa(bootmap);
        reserve_early(bootmap_start, bootmap_start + (bootmap_pages << PAGE_SHIFT),
                      "BOOTMAP");
        // initialize bootmem
        bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
                                         bootmap_start >> PAGE_SHIFT,
                                         start_pfn, last_pfn);
        // Again look up the node id from the physical address;
        // nid may differ from nodeid for the same reason as above.
        nid = phys_to_nid(bootmap_start);
        free_bootmem_with_active_regions(nodeid, end);
        // mark the node online
        node_set_online(nodeid);
    }

    // early_node_mem is the early-boot allocator; it essentially allocates
    // straight out of the e820 map.
    // start/end is the memory range managed by this node,
    // size is the allocation size, align is the required alignment.
    static void * __init early_node_mem(int nodeid, unsigned long start,
                                        unsigned long end, unsigned long size,
                                        unsigned long align)
    {
        unsigned long mem;

        if (start < (MAX_DMA_PFN << PAGE_SHIFT))
            start = MAX_DMA_PFN << PAGE_SHIFT;
        if (start < (MAX_DMA32_PFN << PAGE_SHIFT) &&
            end > (MAX_DMA32_PFN << PAGE_SHIFT))
            start = MAX_DMA32_PFN << PAGE_SHIFT;
        mem = find_e820_area(start, end, size, align);
        if (mem != -1L)
            return __va(mem);
        end = max_pfn_mapped << PAGE_SHIFT;
        if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
            start = MAX_DMA32_PFN << PAGE_SHIFT;
        else
            start = MAX_DMA_PFN << PAGE_SHIFT;
        mem = find_e820_area(start, end, size, align);
        if (mem != -1L)
            return __va(mem);
        return NULL;
    }
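
As a rough picture of what find_e820_area() has to do, the following user-space sketch performs a first-fit search for an aligned block inside a window of free physical ranges. The free-range table and the find_area helper are made up for illustration; the real function also has to skip regions already reserved via reserve_early().

    #include <stdio.h>

    struct range { unsigned long start, end; };   /* a free physical range [start, end) */

    /* First-fit: return an aligned base of 'size' bytes inside [start, end),
     * or -1UL on failure (mirroring find_e820_area()'s -1 return value). */
    static unsigned long find_area(const struct range *avail, int n,
                                   unsigned long start, unsigned long end,
                                   unsigned long size, unsigned long align)
    {
        for (int i = 0; i < n; i++) {
            unsigned long s = avail[i].start > start ? avail[i].start : start;
            unsigned long e = avail[i].end < end ? avail[i].end : end;

            s = (s + align - 1) & ~(align - 1);   /* align the candidate base */
            if (s < e && e - s >= size)
                return s;
        }
        return -1UL;
    }

    int main(void)
    {
        const struct range avail[] = { { 0x100000, 0x200000 }, { 0x400000, 0x800000 } };
        unsigned long mem = find_area(avail, 2, 0x0, 0x800000, 0x10000, 0x1000);

        printf("found area at 0x%lx\n", mem);     /* prints 0x100000 */
        return 0;
    }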

1.3 init_bootmem_node (core bootmem initialization)

    /**
     * init_bootmem_node - register a node as boot memory
     * @pgdat: node to register
     * @freepfn: pfn where the bitmap for this node is to be placed
     * @startpfn: first pfn on the node
     * @endpfn: first pfn after the node
     *
     * Returns the number of bytes needed to hold the bitmap for this node.
     */
    unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
                                           unsigned long startpfn, unsigned long endpfn)
    {
        return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
    }

    static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
                                                  unsigned long mapstart, unsigned long start, unsigned long end)
    {
        unsigned long mapsize;

        // a no-op in this configuration
        mminit_validate_memmodel_limits(&start, &end);
        // Fill in the bdata fields;
        // node_bootmem_map is the start address of the bootmem bitmap.
        bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
        bdata->node_min_pfn = start;
        bdata->node_low_pfn = end;
        // link bdata into bdata_list
        link_bootmem(bdata);
        mapsize = bootmap_bytes(end - start);
        // initially every page is marked reserved (all bits set)
        memset(bdata->node_bootmem_map, 0xff, mapsize);
        return mapsize;
    }
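
The size returned by init_bootmem_core()/bootmap_bytes() is easy to reason about: the bootmem bitmap uses one bit per page frame. A small sketch of the arithmetic, assuming 4 KiB pages (the kernel additionally rounds the byte count up to whole longs, which the sketch mimics):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    /* one bit per page frame, rounded up to whole longs (in the spirit of bootmap_bytes()) */
    static unsigned long bootmap_bytes(unsigned long pages)
    {
        unsigned long bytes = (pages + 7) / 8;
        return (bytes + sizeof(long) - 1) & ~(sizeof(long) - 1);
    }

    /* how many whole pages that bitmap occupies (in the spirit of bootmem_bootmap_pages()) */
    static unsigned long bootmap_pages(unsigned long pages)
    {
        return (bootmap_bytes(pages) + PAGE_SIZE - 1) / PAGE_SIZE;
    }

    int main(void)
    {
        unsigned long pfns = 1UL << 20;   /* 1M page frames == 4 GiB of RAM */

        printf("bitmap: %lu bytes, %lu pages\n",
               bootmap_bytes(pfns), bootmap_pages(pfns));  /* 131072 bytes, 32 pages */
        return 0;
    }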

1.4 free_bootmem_with_active_regions (release the usable pages to bootmem)

    void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
    {
        int i;

        // iterate over all ranges belonging to the given nid
        for_each_active_range_index_in_nid(i, nid) {
            unsigned long size_pages = 0;
            unsigned long end_pfn = early_node_map[i].end_pfn;

            if (early_node_map[i].start_pfn >= max_low_pfn)
                continue;
            if (end_pfn > max_low_pfn)
                end_pfn = max_low_pfn;
            size_pages = end_pfn - early_node_map[i].start_pfn;
            // release the pages at the given start address and size
            free_bootmem_node(NODE_DATA(early_node_map[i].nid),
                              PFN_PHYS(early_node_map[i].start_pfn),
                              size_pages << PAGE_SHIFT);
        }
    }

    void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
                                  unsigned long size)
    {
        unsigned long start, end;

        kmemleak_free_part(__va(physaddr), size);
        start = PFN_UP(physaddr);
        end = PFN_DOWN(physaddr + size);
        // clear the corresponding bits in the bootmem bitmap (a 0 bit means free)
        mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
    }
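
One detail worth noting in free_bootmem_node(): PFN_UP is applied to the start and PFN_DOWN to the end, so only page frames that lie entirely inside the freed range are marked free. A sketch of that rounding, assuming 4 KiB pages:

    /* Why PFN_UP on the start and PFN_DOWN on the end: only page frames fully
     * contained in [physaddr, physaddr + size) may be marked free. */
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

    int main(void)
    {
        unsigned long physaddr = 0x1800, size = 0x3000;  /* deliberately not page aligned */

        printf("free pfns [%lu, %lu)\n", PFN_UP(physaddr), PFN_DOWN(physaddr + size));
        /* prints [2, 4): the partial pages at both ends stay reserved */
        return 0;
    }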

2. Initializing pg_data_t and the zone data structures

arch -> x86 -> mm -> init_64.c

    void __init paging_init(void)
    {
        unsigned long max_zone_pfns[MAX_NR_ZONES];

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
        max_zone_pfns[ZONE_NORMAL] = max_pfn;
        // ??? (sparsemem related) -- to be figured out later
        sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();
        // this call currently does nothing here, skip it
        node_clear_state(0, N_NORMAL_MEMORY);
        free_area_init_nodes(max_zone_pfns);
    }

    // max_zone_pfn is an array holding the highest page frame number of each zone.
    // {10, 20, 40} would mean:
    //   ZONE_DMA    ends at pfn 10
    //   ZONE_DMA32  ends at pfn 20
    //   ZONE_NORMAL ends at pfn 40
    void __init free_area_init_nodes(unsigned long *max_zone_pfn)
    {
        unsigned long nid;
        int i;

        // sort early_node_map by start_pfn
        sort_node_map();
        // record the boundaries of the different zones
        memset(arch_zone_lowest_possible_pfn, 0,
               sizeof(arch_zone_lowest_possible_pfn));
        memset(arch_zone_highest_possible_pfn, 0,
               sizeof(arch_zone_highest_possible_pfn));
        // find_min_pfn_with_active_regions walks every early_node_map entry
        // and returns the smallest start_pfn
        arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
        // max_zone_pfn[0] is MAX_DMA_PFN
        arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
        for (i = 1; i < MAX_NR_ZONES; i++) {
            if (i == ZONE_MOVABLE)
                continue;
            arch_zone_lowest_possible_pfn[i] =
                arch_zone_highest_possible_pfn[i-1];
            arch_zone_highest_possible_pfn[i] =
                max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
        }
        arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
        arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
        // After the code above, arch_zone_lowest_possible_pfn/arch_zone_highest_possible_pfn
        // could for example look like:
        //   arch_zone_lowest_possible_pfn  {1, 4, 6}
        //   arch_zone_highest_possible_pfn {4, 6, 8}
        // i.e. DMA pfns [1, 4), DMA32 pfns [4, 6), NORMAL pfns [6, 8)
        // compute the starting pfn of each node's movable memory range
        memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
        find_zone_movable_pfns_for_nodes(zone_movable_pfn);
        // memory printout code removed
        // debug only, skip
        mminit_verify_pageflags_layout();
        // set nr_node_ids (highest possible node id + 1)
        setup_nr_node_ids();
        // iterate over all online nodes and initialize their zones
        for_each_online_node(nid) {
            pg_data_t *pgdat = NODE_DATA(nid);

            free_area_init_node(nid, NULL, find_min_pfn_for_node(nid), NULL);
            if (pgdat->node_present_pages)
                node_set_state(nid, N_HIGH_MEMORY);
            check_for_regular_memory(pgdat);
        }
    }

    void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
                                          unsigned long node_start_pfn, unsigned long *zholes_size)
    {
        pg_data_t *pgdat = NODE_DATA(nid);

        pgdat->node_id = nid;
        pgdat->node_start_pfn = node_start_pfn;
        // Count all pages, both with and without holes;
        // fills pgdat->node_spanned_pages and pgdat->node_present_pages.
        calculate_node_totalpages(pgdat, zones_size, zholes_size);
        // Allocate node_mem_map with the bootmem allocator:
        // essentially the struct page array covering every page of this node.
        alloc_node_mem_map(pgdat);
        // fill in the zones
        free_area_init_core(pgdat, zones_size, zholes_size);
    }
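
The arch_zone_lowest/highest_possible_pfn computation above is just a running cursor over max_zone_pfn, clamped with max() so the boundaries never go backwards. A toy user-space sketch with made-up pfn values (three zones, ZONE_MOVABLE ignored):

    #include <stdio.h>

    #define NR_ZONES 3

    int main(void)
    {
        unsigned long max_zone_pfn[NR_ZONES] = { 4096, 1048576, 4194304 };
        unsigned long lo[NR_ZONES], hi[NR_ZONES];
        unsigned long min_pfn = 16;                 /* lowest usable pfn (made up) */

        lo[0] = min_pfn;
        hi[0] = max_zone_pfn[0];
        for (int i = 1; i < NR_ZONES; i++) {
            lo[i] = hi[i - 1];                      /* each zone starts where the previous ends */
            hi[i] = max_zone_pfn[i] > lo[i] ? max_zone_pfn[i] : lo[i];
        }
        for (int i = 0; i < NR_ZONES; i++)
            printf("zone %d: pfn [%lu, %lu)\n", i, lo[i], hi[i]);
        return 0;
    }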

2.1 free_area_init_core (core logic)

    // in this path zones_size/zholes_size are both NULL
    static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                                                 unsigned long *zones_size, unsigned long *zholes_size)
    {
        enum zone_type j;
        int nid = pgdat->node_id;
        unsigned long zone_start_pfn = pgdat->node_start_pfn;
        int ret;

        // initialize node_size_lock
        pgdat_resize_init(pgdat);
        pgdat->nr_zones = 0;
        init_waitqueue_head(&pgdat->kswapd_wait);
        pgdat->kswapd_max_order = 0;
        // ignore cgroup for now
        pgdat_page_cgroup_init(pgdat);

        for (j = 0; j < MAX_NR_ZONES; j++) {
            struct zone *zone = pgdat->node_zones + j;
            unsigned long size, realsize, memmap_pages;
            enum lru_list l;

            // Compute the zone's total (spanned) pages and the pages excluding holes;
            // size is a number of page frames.
            size = zone_spanned_pages_in_node(nid, j, zones_size);
            realsize = size - zone_absent_pages_in_node(nid, j,
                                                        zholes_size);
            // how many pages the struct page array (memmap) for this zone needs
            memmap_pages = PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
            // The low-memory special cases are not considered here; that code was removed.
            // Subtract the DMA reserve.
            if (j == 0 && realsize > dma_reserve) {
                realsize -= dma_reserve;
            }
            // bump nr_kernel_pages / nr_all_pages
            if (!is_highmem_idx(j))
                // without highmem these pages count towards the kernel-usable total
                nr_kernel_pages += realsize;
            nr_all_pages += realsize;

            zone->spanned_pages = size;
            zone->present_pages = realsize;
            zone->node = nid;
            // zone reclaim only kicks in once unmapped pages exceed this threshold
            zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) / 100;
            // slab reclaim only kicks in once reclaimable slab pages exceed this threshold
            zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
            zone->name = zone_names[j];
            spin_lock_init(&zone->lock);
            spin_lock_init(&zone->lru_lock);
            // memory hotplug related
            zone_seqlock_init(zone);
            zone->zone_pgdat = pgdat;
            zone->prev_priority = DEF_PRIORITY;
            // ??? (per-cpu pagesets) -- to be investigated
            zone_pcp_init(zone);
            // initialize a pile of per-zone fields
            for_each_lru(l) {
                INIT_LIST_HEAD(&zone->lru[l].list);
                zone->reclaim_stat.nr_saved_scan[l] = 0;
            }
            zone->reclaim_stat.recent_rotated[0] = 0;
            zone->reclaim_stat.recent_rotated[1] = 0;
            zone->reclaim_stat.recent_scanned[0] = 0;
            zone->reclaim_stat.recent_scanned[1] = 0;
            zap_zone_vm_stats(zone);
            zone->flags = 0;
            if (!size)
                continue;
            // ??? -- to be investigated
            set_pageblock_order(pageblock_default_order());
            setup_usemap(pgdat, zone, size);
            // initialize the zone's start pfn and its free_list lists
            ret = init_currently_empty_zone(zone, zone_start_pfn,
                                            size, MEMMAP_EARLY);
            // initialize the struct pages
            memmap_init(size, nid, j, zone_start_pfn);
            zone_start_pfn += size;
        }
    }
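
The memmap_pages line above is worth a quick calculation: the struct page array for a zone costs PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT pages. A sketch assuming a 56-byte struct page and 4 KiB pages (both are configuration dependent):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

    int main(void)
    {
        unsigned long spanned = 1UL << 20;      /* 1M frames == a 4 GiB zone */
        unsigned long page_sz = 56;             /* assumed sizeof(struct page) */
        unsigned long memmap_pages = PAGE_ALIGN(spanned * page_sz) >> PAGE_SHIFT;

        printf("memmap needs %lu pages (~%lu MiB)\n",
               memmap_pages, memmap_pages >> 8);  /* 14336 pages, ~56 MiB */
        return 0;
    }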

2.2、helper func

2.2.1 memmap_init_zone (initialize all pages as reserved)

    /*
     * Initially all pages are reserved - free ones are freed
     * up by free_all_bootmem() once the early boot process is
     * done. Non-atomic initialization, single-pass.
     */
    void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                                    unsigned long start_pfn, enum memmap_context context)
    {
        struct page *page;
        unsigned long end_pfn = start_pfn + size;
        unsigned long pfn;
        struct zone *z;

        if (highest_memmap_pfn < end_pfn - 1)
            highest_memmap_pfn = end_pfn - 1;

        z = &NODE_DATA(nid)->node_zones[zone];
        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
            /*
             * There can be holes in boot-time mem_map[]s
             * handed to this function. They do not
             * exist on hotplugged memory.
             */
            if (context == MEMMAP_EARLY) {
                if (!early_pfn_valid(pfn))
                    continue;
                if (!early_pfn_in_nid(pfn, nid))
                    continue;
            }
            page = pfn_to_page(pfn);
            // set the page flags: zone/node/section
            set_page_links(page, zone, nid, pfn);
            // some sanity checks
            mminit_verify_page_links(page, zone, nid, pfn);
            // set page->_count to 1
            init_page_count(page);
            // reset page->_mapcount (to -1, i.e. unmapped)
            reset_page_mapcount(page);
            SetPageReserved(page);
            /*
             * Mark the block movable so that blocks are reserved for
             * movable at startup. This will force kernel allocations
             * to reserve their blocks rather than leaking throughout
             * the address space during boot when many long-lived
             * kernel allocations are made. Later some blocks near
             * the start are marked MIGRATE_RESERVE by
             * setup_zone_migrate_reserve()
             *
             * bitmap is created for zone's valid pfn range. but memmap
             * can be created for invalid pages (for alignment)
             * check here not to call set_pageblock_migratetype() against
             * pfn out of zone.
             */
            if ((z->zone_start_pfn <= pfn)
                && (pfn < z->zone_start_pfn + z->spanned_pages)
                && !(pfn & (pageblock_nr_pages - 1)))
                set_pageblock_migratetype(page, MIGRATE_MOVABLE);
            // initialize the lru list head
            INIT_LIST_HEAD(&page->lru);
    #ifdef WANT_PAGE_VIRTUAL
            /* The shift won't overflow because ZONE_NORMAL is below 4G. */
            if (!is_highmem_idx(zone))
                set_page_address(page, __va(pfn << PAGE_SHIFT));
    #endif
        }
    }
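
The !(pfn & (pageblock_nr_pages - 1)) test above is true exactly at pageblock boundaries, so the migratetype is written once per pageblock rather than once per page. A sketch assuming pageblock_nr_pages == 512 (2 MiB blocks of 4 KiB pages, which depends on the configured pageblock order):

    #include <stdio.h>

    #define PAGEBLOCK_NR_PAGES 512UL   /* assumed value, configuration dependent */

    int main(void)
    {
        for (unsigned long pfn = 0; pfn < 2048; pfn++)
            if (!(pfn & (PAGEBLOCK_NR_PAGES - 1)))
                printf("pfn %lu starts a pageblock\n", pfn);  /* 0, 512, 1024, 1536 */
        return 0;
    }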

2.2.2 find_zone_movable_pfns_for_nodes (still hard to follow)

    /*
     * Find the PFN the Movable zone begins in each node. Kernel memory
     * is spread evenly between nodes as long as the nodes have enough
     * memory. When they don't, some nodes will have more kernelcore than
     * others
     */
    static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
    {
        int i, nid;
        unsigned long usable_startpfn;
        unsigned long kernelcore_node, kernelcore_remaining;
        /* save the state before borrow the nodemask */
        nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
        unsigned long totalpages = early_calculate_totalpages();
        int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);

        /*
         * If movablecore was specified, calculate what size of
         * kernelcore that corresponds so that memory usable for
         * any allocation type is evenly spread. If both kernelcore
         * and movablecore are specified, then the value of kernelcore
         * will be used for required_kernelcore if it's greater than
         * what movablecore would have allowed.
         */
        if (required_movablecore) {
            unsigned long corepages;

            /*
             * Round-up so that ZONE_MOVABLE is at least as large as what
             * was requested by the user
             */
            required_movablecore =
                roundup(required_movablecore, MAX_ORDER_NR_PAGES);
            corepages = totalpages - required_movablecore;
            required_kernelcore = max(required_kernelcore, corepages);
        }

        /* If kernelcore was not specified, there is no ZONE_MOVABLE */
        if (!required_kernelcore)
            goto out;

        /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
        find_usable_zone_for_movable();
        usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];

    restart:
        /* Spread kernelcore memory as evenly as possible throughout nodes */
        kernelcore_node = required_kernelcore / usable_nodes;
        for_each_node_state(nid, N_HIGH_MEMORY) {
            /*
             * Recalculate kernelcore_node if the division per node
             * now exceeds what is necessary to satisfy the requested
             * amount of memory for the kernel
             */
            if (required_kernelcore < kernelcore_node)
                kernelcore_node = required_kernelcore / usable_nodes;

            /*
             * As the map is walked, we track how much memory is usable
             * by the kernel using kernelcore_remaining. When it is
             * 0, the rest of the node is usable by ZONE_MOVABLE
             */
            kernelcore_remaining = kernelcore_node;

            /* Go through each range of PFNs within this node */
            for_each_active_range_index_in_nid(i, nid) {
                unsigned long start_pfn, end_pfn;
                unsigned long size_pages;

                start_pfn = max(early_node_map[i].start_pfn,
                                zone_movable_pfn[nid]);
                end_pfn = early_node_map[i].end_pfn;
                if (start_pfn >= end_pfn)
                    continue;

                /* Account for what is only usable for kernelcore */
                if (start_pfn < usable_startpfn) {
                    unsigned long kernel_pages;
                    kernel_pages = min(end_pfn, usable_startpfn)
                                   - start_pfn;

                    kernelcore_remaining -= min(kernel_pages,
                                                kernelcore_remaining);
                    required_kernelcore -= min(kernel_pages,
                                               required_kernelcore);

                    /* Continue if range is now fully accounted */
                    if (end_pfn <= usable_startpfn) {
                        /*
                         * Push zone_movable_pfn to the end so
                         * that if we have to rebalance
                         * kernelcore across nodes, we will
                         * not double account here
                         */
                        zone_movable_pfn[nid] = end_pfn;
                        continue;
                    }
                    start_pfn = usable_startpfn;
                }

                /*
                 * The usable PFN range for ZONE_MOVABLE is from
                 * start_pfn->end_pfn. Calculate size_pages as the
                 * number of pages used as kernelcore
                 */
                size_pages = end_pfn - start_pfn;
                if (size_pages > kernelcore_remaining)
                    size_pages = kernelcore_remaining;
                zone_movable_pfn[nid] = start_pfn + size_pages;

                /*
                 * Some kernelcore has been met, update counts and
                 * break if the kernelcore for this node has been
                 * satisified
                 */
                required_kernelcore -= min(required_kernelcore,
                                           size_pages);
                kernelcore_remaining -= size_pages;
                if (!kernelcore_remaining)
                    break;
            }
        }

        /*
         * If there is still required_kernelcore, we do another pass with one
         * less node in the count. This will push zone_movable_pfn[nid] further
         * along on the nodes that still have memory until kernelcore is
         * satisified
         */
        usable_nodes--;
        if (usable_nodes && required_kernelcore > usable_nodes)
            goto restart;

        /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
        for (nid = 0; nid < MAX_NUMNODES; nid++)
            zone_movable_pfn[nid] =
                roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);

    out:
        /* restore the node_state */
        node_states[N_HIGH_MEMORY] = saved_node_state;
    }
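
A simplified way to read the function above: required_kernelcore is split evenly over the nodes that have memory, and ZONE_MOVABLE on each node begins right after that node's share of kernel pages. The real code also handles multiple ranges per node, the low-zone portion below usable_startpfn, and a rebalancing pass when a node is too small; the toy sketch below only shows the even split, with made-up node ranges:

    #include <stdio.h>

    struct node_range { unsigned long start_pfn, end_pfn; };

    int main(void)
    {
        /* two nodes of 1M page frames (4 GiB) each, contiguous, no holes */
        struct node_range nodes[] = { { 0, 1UL << 20 }, { 1UL << 20, 2UL << 20 } };
        unsigned long required_kernelcore = 1UL << 20;       /* kernelcore= total, in pages */
        unsigned long kernelcore_node = required_kernelcore / 2;

        for (int nid = 0; nid < 2; nid++)
            printf("node %d: ZONE_MOVABLE starts at pfn %lu\n",
                   nid, nodes[nid].start_pfn + kernelcore_node);
        return 0;
    }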

3. Building the zonelists

    void build_all_zonelists(void)
    {
        // by default we can assume ZONELIST_ORDER_NODE ordering
        set_zonelist_order();

        if (system_state == SYSTEM_BOOTING) {
            __build_all_zonelists(NULL);
            mminit_verify_zonelist();
            cpuset_init_current_mems_allowed();
        } else {
            /* we have to stop all cpus to guarantee there is no user
               of zonelist */
            stop_machine(__build_all_zonelists, NULL, NULL);
            /* cpuset refresh routine should be here */
        }
        vm_total_pages = nr_free_pagecache_pages();
        /*
         * Disable grouping by mobility if the number of pages in the
         * system is too low to allow the mechanism to work. It would be
         * more accurate, but expensive to check per-zone. This check is
         * made on memory-hotadd so a system can start with mobility
         * disabled and enable it later
         */
        if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
            page_group_by_mobility_disabled = 1;
        else
            page_group_by_mobility_disabled = 0;
    }

    static int __build_all_zonelists(void *dummy)
    {
        int nid;
        int cpu;

        memset(node_load, 0, sizeof(node_load));
        for_each_online_node(nid) {
            pg_data_t *pgdat = NODE_DATA(nid);

            build_zonelists(pgdat);
            build_zonelist_cache(pgdat);
        }
        /*
         * Initialize the boot_pagesets that are going to be used
         * for bootstrapping processors. The real pagesets for
         * each zone will be allocated later when the per cpu
         * allocator is available.
         *
         * boot_pagesets are used also for bootstrapping offline
         * cpus if the system is already booted because the pagesets
         * are needed to initialize allocators on a specific cpu too.
         * F.e. the percpu allocator needs the page allocator which
         * needs the percpu allocator in order to allocate its pagesets
         * (a chicken-egg dilemma).
         */
        for_each_possible_cpu(cpu)
            setup_pageset(&per_cpu(boot_pageset, cpu), 0);
        return 0;
    }
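
The page_group_by_mobility_disabled threshold is small in absolute terms. A quick calculation, assuming 4 KiB pages, 2 MiB pageblocks (pageblock_nr_pages == 512) and 5 migrate types; all three values depend on the kernel configuration:

    #include <stdio.h>

    int main(void)
    {
        unsigned long pageblock_nr_pages = 512;   /* 2 MiB / 4 KiB, assumed */
        unsigned long migrate_types = 5;          /* assumed MIGRATE_TYPES for this era */
        unsigned long threshold = pageblock_nr_pages * migrate_types;

        printf("threshold: %lu pages (~%lu MiB)\n", threshold, threshold >> 8);
        /* 2560 pages, ~10 MiB: below that, grouping by mobility is disabled */
        return 0;
    }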

3.1 Helper functions

3.1.1 build_zonelists

    static int node_order[MAX_NUMNODES];

    static void build_zonelists(pg_data_t *pgdat)
    {
        int j, node, load;
        enum zone_type i;
        nodemask_t used_mask;
        int local_node, prev_node;
        struct zonelist *zonelist;
        int order = current_zonelist_order;

        for (i = 0; i < MAX_ZONELISTS; i++) {
            zonelist = pgdat->node_zonelists + i;
            zonelist->_zonerefs[0].zone = NULL;
            zonelist->_zonerefs[0].zone_idx = 0;
        }
        local_node = pgdat->node_id;
        load = nr_online_nodes;
        prev_node = local_node;
        nodes_clear(used_mask);
        memset(node_order, 0, sizeof(node_order));
        j = 0;
        // pick, one by one, the node closest to local_node
        while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
            int distance = node_distance(local_node, node);

            /*
             * If another node is sufficiently far away then it is better
             * to reclaim pages in a zone before going off node.
             */
            if (distance > RECLAIM_DISTANCE)
                zone_reclaim_mode = 1;
            /*
             * We don't want to pressure a particular node.
             * So adding penalty to the first node in same
             * distance group to make it round-robin.
             */
            if (distance != node_distance(local_node, prev_node))
                node_load[node] = load;
            prev_node = node;
            load--;
            if (order == ZONELIST_ORDER_NODE)
                build_zonelists_in_node_order(pgdat, node);
            else
                node_order[j++] = node; /* remember order */
        }
        if (order == ZONELIST_ORDER_ZONE) {
            /* calculate node order -- i.e., DMA last! */
            build_zonelists_in_zone_order(pgdat, j);
        }
        build_thisnode_zonelists(pgdat);
    }
    /*
     * Build zonelists ordered by node and zones within node.
     * This results in maximum locality--normal zone overflows into local
     * DMA zone, if any--but risks exhausting DMA zone.
     */
    static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
    {
        int j;
        struct zonelist *zonelist;

        zonelist = &pgdat->node_zonelists[0];
        for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
            ;
        j = build_zonelists_node(NODE_DATA(node), zonelist, j,
                                 MAX_NR_ZONES - 1);
        zonelist->_zonerefs[j].zone = NULL;
        zonelist->_zonerefs[j].zone_idx = 0;
    }
    /*
     * Build gfp_thisnode zonelists
     */
    static void build_thisnode_zonelists(pg_data_t *pgdat)
    {
        int j;
        struct zonelist *zonelist;

        zonelist = &pgdat->node_zonelists[1];
        j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
        zonelist->_zonerefs[j].zone = NULL;
        zonelist->_zonerefs[j].zone_idx = 0;
    }
    /**
     * find_next_best_node - find the next node that should appear in a given node's fallback list
     * @node: node whose fallback list we're appending
     * @used_node_mask: nodemask_t of already used nodes
     *
     * We use a number of factors to determine which is the next node that should
     * appear on a given node's fallback list. The node should not have appeared
     * already in @node's fallback list, and it should be the next closest node
     * according to the distance array (which contains arbitrary distance values
     * from each node to each node in the system), and should also prefer nodes
     * with no CPUs, since presumably they'll have very little allocation pressure
     * on them otherwise.
     * It returns -1 if no node is found.
     */
    static int find_next_best_node(int node, nodemask_t *used_node_mask)
    {
        int n, val;
        int min_val = INT_MAX;
        int best_node = -1;
        const struct cpumask *tmp = cpumask_of_node(0);

        // return the local node first
        if (!node_isset(node, *used_node_mask)) {
            node_set(node, *used_node_mask);
            return node;
        }
        for_each_node_state(n, N_HIGH_MEMORY) {
            // skip nodes that are already in the list
            if (node_isset(n, *used_node_mask))
                continue;
            /* Use the distance array to find the distance */
            val = node_distance(node, n);
            /* Penalize nodes under us ("prefer the next node") */
            val += (n < node);
            /* Give preference to headless and unused nodes */
            tmp = cpumask_of_node(n);
            if (!cpumask_empty(tmp))
                val += PENALTY_FOR_NODE_WITH_CPUS;
            /* Slight preference for less loaded node */
            val *= (MAX_NODE_LOAD*MAX_NUMNODES);
            val += node_load[n];
            if (val < min_val) {
                min_val = val;
                best_node = n;
            }
        }
        if (best_node >= 0)
            node_set(best_node, *used_node_mask);
        return best_node;
    }
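
To see how find_next_best_node() ranks candidates, here is a user-space sketch of the scoring with made-up distances, CPU placement and load values (the MAX_NODE_LOAD, MAX_NUMNODES and PENALTY_FOR_NODE_WITH_CPUS constants below are stand-ins, not the kernel's definitions): a headless node at the same distance wins over one that has CPUs.

    #include <stdio.h>
    #include <limits.h>

    #define MAX_NODE_LOAD 4                      /* stand-in values for illustration */
    #define MAX_NUMNODES  4
    #define PENALTY_FOR_NODE_WITH_CPUS 1

    int main(void)
    {
        int node = 0;                             /* building node 0's fallback list */
        int distance[4]  = { 10, 20, 20, 30 };    /* pretend node_distance(0, n) */
        int has_cpus[4]  = { 1, 1, 0, 1 };        /* is cpumask_of_node(n) non-empty? */
        int node_load[4] = { 0, 0, 0, 0 };

        int best = -1, min_val = INT_MAX;
        for (int n = 1; n < 4; n++) {             /* node 0 itself was returned first */
            int val = distance[n];
            val += (n < node);                    /* prefer higher-numbered nodes on ties */
            if (has_cpus[n])
                val += PENALTY_FOR_NODE_WITH_CPUS;
            val *= MAX_NODE_LOAD * MAX_NUMNODES;  /* make load only a tie-breaker */
            val += node_load[n];
            printf("candidate node %d: val = %d\n", n, val);
            if (val < min_val) { min_val = val; best = n; }
        }
        printf("next best node: %d\n", best);     /* node 2: same distance as node 1, but headless */
        return 0;
    }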

4. Initializing the buddy system

    static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
    {
        int aligned;
        struct page *page;
        unsigned long start, end, pages, count = 0;

        if (!bdata->node_bootmem_map)
            return 0;

        start = bdata->node_min_pfn;
        end = bdata->node_low_pfn;
        aligned = !(start & (BITS_PER_LONG - 1));

        while (start < end) {
            unsigned long *map, idx, vec;

            map = bdata->node_bootmem_map;
            idx = start - bdata->node_min_pfn;
            vec = ~map[idx / BITS_PER_LONG];
            if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
                int order = ilog2(BITS_PER_LONG);

                __free_pages_bootmem(pfn_to_page(start), order);
                count += BITS_PER_LONG;
            } else {
                unsigned long off = 0;

                while (vec && off < BITS_PER_LONG) {
                    if (vec & 1) {
                        page = pfn_to_page(start + off);
                        __free_pages_bootmem(page, 0);
                        count++;
                    }
                    vec >>= 1;
                    off++;
                }
            }
            start += BITS_PER_LONG;
        }

        page = virt_to_page(bdata->node_bootmem_map);
        pages = bdata->node_low_pfn - bdata->node_min_pfn;
        pages = bootmem_bootmap_pages(pages);
        count += pages;
        while (pages--)
            __free_pages_bootmem(page++, 0);

        return count;
    }

    void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
    {
        if (order == 0) {
            __ClearPageReserved(page);
            set_page_count(page, 0);
            set_page_refcounted(page);
            __free_page(page);
        } else {
            int loop;

            prefetchw(page);
            for (loop = 0; loop < BITS_PER_LONG; loop++) {
                struct page *p = &page[loop];

                if (loop + 1 < BITS_PER_LONG)
                    prefetchw(p + 1);
                __ClearPageReserved(p);
                set_page_count(p, 0);
            }
            set_page_refcounted(page);
            __free_pages(page, order);
        }
    }

    void __free_pages(struct page *page, unsigned int order)
    {
        if (put_page_testzero(page)) {
            if (order == 0)
                free_hot_cold_page(page, 0);
            else
                __free_pages_ok(page, order);
        }
    }

    static void __free_pages_ok(struct page *page, unsigned int order)
    {
        unsigned long flags;
        int i;
        int bad = 0;
        int wasMlocked = __TestClearPageMlocked(page);

        trace_mm_page_free_direct(page, order);
        kmemcheck_free_shadow(page, order);

        for (i = 0 ; i < (1 << order) ; ++i)
            bad += free_pages_check(page + i);
        if (bad)
            return;

        if (!PageHighMem(page)) {
            debug_check_no_locks_freed(page_address(page), PAGE_SIZE << order);
            debug_check_no_obj_freed(page_address(page),
                                     PAGE_SIZE << order);
        }
        arch_free_page(page, order);
        kernel_map_pages(page, 1 << order, 0);

        local_irq_save(flags);
        if (unlikely(wasMlocked))
            free_page_mlock(page);
        __count_vm_events(PGFREE, 1 << order);
        free_one_page(page_zone(page), page, order,
                      get_pageblock_migratetype(page));
        local_irq_restore(flags);
    }
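
The trickiest part of free_all_bootmem_core() is the bitmap walk: a cleared bit in the bootmem map means "free", so ~map[word] is the free mask, and a fully free aligned word (vec == ~0UL) can be handed to the buddy allocator as one order-ilog2(BITS_PER_LONG) block. A user-space sketch of that loop, with the alignment check dropped and printf standing in for __free_pages_bootmem():

    #include <stdio.h>

    #define BITS_PER_LONG 64

    static void free_range(const unsigned long *map, unsigned long start_pfn,
                           unsigned long end_pfn)
    {
        for (unsigned long start = start_pfn; start < end_pfn; start += BITS_PER_LONG) {
            unsigned long vec = ~map[(start - start_pfn) / BITS_PER_LONG];  /* 0 bit == free */

            if (vec == ~0UL && start + BITS_PER_LONG < end_pfn) {
                printf("free pfn %lu, order 6 (64 pages at once)\n", start);
            } else {
                for (unsigned long off = 0; vec && off < BITS_PER_LONG; off++, vec >>= 1)
                    if (vec & 1)
                        printf("free pfn %lu, order 0\n", start + off);
            }
        }
    }

    int main(void)
    {
        /* word 0: all pages free; word 1: only the first four pages free */
        unsigned long map[2] = { 0x0, ~0xfUL };

        free_range(map, 0, 128);
        return 0;
    }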

4.1 free_one_page (core function)

    static void free_one_page(struct zone *zone, struct page *page, int order,
                              int migratetype)
    {
        spin_lock(&zone->lock);
        zone->all_unreclaimable = 0;
        zone->pages_scanned = 0;

        __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
        __free_one_page(page, zone, order, migratetype);
        spin_unlock(&zone->lock);
    }

    /*
     * Freeing function for a buddy system allocator.
     *
     * The concept of a buddy system is to maintain direct-mapped table
     * (containing bit values) for memory blocks of various "orders".
     * The bottom level table contains the map for the smallest allocatable
     * units of memory (here, pages), and each level above it describes
     * pairs of units from the levels below, hence, "buddies".
     * At a high level, all that happens here is marking the table entry
     * at the bottom level available, and propagating the changes upward
     * as necessary, plus some accounting needed to play nicely with other
     * parts of the VM system.
     * At each level, we keep a list of pages, which are heads of continuous
     * free pages of length of (1 << order) and marked with PG_buddy. Page's
     * order is recorded in page_private(page) field.
     * So when we are allocating or freeing one, we can derive the state of the
     * other. That is, if we allocate a small block, and both were
     * free, the remainder of the region must be split into blocks.
     * If a block is freed, and its buddy is also free, then this
     * triggers coalescing into a block of larger size.
     *
     * -- wli
     */
    static inline void __free_one_page(struct page *page,
                                       struct zone *zone, unsigned int order,
                                       int migratetype)
    {
        unsigned long page_idx;

        if (unlikely(PageCompound(page)))
            if (unlikely(destroy_compound_page(page, order)))
                return;

        VM_BUG_ON(migratetype == -1);

        page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

        VM_BUG_ON(page_idx & ((1 << order) - 1));
        VM_BUG_ON(bad_range(zone, page));

        while (order < MAX_ORDER-1) {
            unsigned long combined_idx;
            struct page *buddy;

            buddy = __page_find_buddy(page, page_idx, order);
            if (!page_is_buddy(page, buddy, order))
                break;

            /* Our buddy is free, merge with it and move up one order. */
            list_del(&buddy->lru);
            zone->free_area[order].nr_free--;
            rmv_page_order(buddy);
            combined_idx = __find_combined_index(page_idx, order);
            page = page + (combined_idx - page_idx);
            page_idx = combined_idx;
            order++;
        }
        set_page_order(page, order);
        list_add(&page->lru,
                 &zone->free_area[order].free_list[migratetype]);
        zone->free_area[order].nr_free++;
    }
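
The merge loop at the bottom relies on two index tricks that are easy to check by hand: __page_find_buddy() flips bit 'order' of page_idx to locate the buddy block, and __find_combined_index() clears that bit to get the start of the merged, one-order-larger block. A quick sketch of the arithmetic:

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_idx = 12;                              /* an order-2 block (4 pages) */
        unsigned int order = 2;

        unsigned long buddy_idx    = page_idx ^ (1UL << order);   /* flip bit 2 -> 8 */
        unsigned long combined_idx = page_idx & ~(1UL << order);  /* clear bit 2 -> 8 */

        printf("buddy of %lu at order %u is %lu; merged order-%u block starts at %lu\n",
               page_idx, order, buddy_idx, order + 1, combined_idx);
        return 0;
    }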