1、struct pglist_data (top-level structure of the buddy system)
typedef struct pglist_data {
    // on 64-bit systems this covers DMA / DMA32 / NORMAL
    struct zone node_zones[MAX_NR_ZONES];
    // MAX_ZONELISTS is 2
    // [0] -> Zonelist with fallback
    // [1] -> No fallback (GFP_THISNODE)
    struct zonelist node_zonelists[MAX_ZONELISTS];
    // number of zones in this node
    int nr_zones;
    // in the original flat-memory model, all struct pages live in one big array
    struct page *node_mem_map;
    struct page_cgroup *node_page_cgroup;
    // bootmem allocator used during early boot
    struct bootmem_data *bdata;
    // lock protecting the node size for memory hotplug
    spinlock_t node_size_lock;
    // first page frame number of this node
    unsigned long node_start_pfn;
    // total number of physical pages
    unsigned long node_present_pages;
    // total size of the physical page range, including holes
    unsigned long node_spanned_pages;
    // node ID
    int node_id;
    // wait queue used to wake the kswapd thread
    wait_queue_head_t kswapd_wait;
    // task_struct of the kswapd thread
    struct task_struct *kswapd;
    // largest allocation order that kswapd should reclaim for
    int kswapd_max_order;
} pg_data_t;

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * If zlcache_ptr is not NULL, then it is just the address of zlcache,
 * as explained above. If zlcache_ptr is NULL, there is no zlcache.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()     - Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx() - Return the index of the zone for an entry
 * zonelist_node_idx() - Return the index of the node for an entry
 */
struct zonelist {
    struct zonelist_cache *zlcache_ptr;                   // NULL or &zlcache
    struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
    struct zonelist_cache zlcache;                        // optional ...
};

/*
 * This struct contains information about a zone in a zonelist. It is stored
 * here to avoid dereferences into large structures and lookups of tables
 */
struct zoneref {
    struct zone *zone;  /* Pointer to actual zone */
    int zone_idx;       /* zone_idx(zoneref->zone) */
};

/*
 * We cache key information from each zonelist for smaller cache
 * footprint when scanning for free pages in get_page_from_freelist().
 *
 * 1) The BITMAP fullzones tracks which zones in a zonelist have come
 *    up short of free memory since the last time (last_fullzone_zap)
 *    we zero'd fullzones.
 * 2) The array z_to_n[] maps each zone in the zonelist to its node
 *    id, so that we can efficiently evaluate whether that node is
 *    set in the current tasks mems_allowed.
 *
 * Both fullzones and z_to_n[] are one-to-one with the zonelist,
 * indexed by a zones offset in the zonelist zones[] array.
 *
 * The get_page_from_freelist() routine does two scans. During the
 * first scan, we skip zones whose corresponding bit in 'fullzones'
 * is set or whose corresponding node in current->mems_allowed (which
 * comes from cpusets) is not set. During the second scan, we bypass
 * this zonelist_cache, to ensure we look methodically at each zone.
 *
 * Once per second, we zero out (zap) fullzones, forcing us to
 * reconsider nodes that might have regained more free memory.
 * The field last_full_zap is the time we last zapped fullzones.
 *
 * This mechanism reduces the amount of time we waste repeatedly
 * reexaming zones for free memory when they just came up low on
 * memory momentarilly ago.
 *
 * The zonelist_cache struct members logically belong in struct
 * zonelist. However, the mempolicy zonelists constructed for
 * MPOL_BIND are intentionally variable length (and usually much
 * shorter). A general purpose mechanism for handling structs with
 * multiple variable length members is more mechanism than we want
 * here. We resort to some special case hackery instead.
 *
 * The MPOL_BIND zonelists don't need this zonelist_cache (in good
 * part because they are shorter), so we put the fixed length stuff
 * at the front of the zonelist struct, ending in a variable length
 * zones[], as is needed by MPOL_BIND.
 *
 * Then we put the optional zonelist cache on the end of the zonelist
 * struct. This optional stuff is found by a 'zlcache_ptr' pointer in
 * the fixed length portion at the front of the struct. This pointer
 * both enables us to find the zonelist cache, and in the case of
 * MPOL_BIND zonelists, (which will just set the zlcache_ptr to NULL)
 * to know that the zonelist cache is not there.
 *
 * The end result is that struct zonelists come in two flavors:
 * 1) The full, fixed length version, shown below, and
 * 2) The custom zonelists for MPOL_BIND.
 * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache.
 *
 * Even though there may be multiple CPU cores on a node modifying
 * fullzones or last_full_zap in the same zonelist_cache at the same
 * time, we don't lock it. This is just hint data - if it is wrong now
 * and then, the allocator will still function, perhaps a bit slower.
 */
struct zonelist_cache {
    unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];      /* zone->nid */
    DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);  /* zone full? */
    unsigned long last_full_zap;                        /* when last zap'd (jiffies) */
};
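To make the fallback order concrete, here is a minimal sketch (not kernel source) of how an allocation walks a node's fallback zonelist using the zonelist_zone()/zonelist_zone_idx() helpers named in the comment above. The highest_zoneidx parameter stands in for the zone limit derived from the GFP mask; the real get_page_from_freelist() additionally checks watermarks, cpusets and the zonelist_cache.

/* Minimal sketch: walk node_zonelists[0] (the list with fallback) and return
 * the first zone whose index does not exceed the limit allowed by the caller.
 * Illustration only; names suffixed with _sketch are not kernel functions. */
static struct zone *pick_zone_sketch(pg_data_t *pgdat, int highest_zoneidx)
{
    struct zonelist *zonelist = &pgdat->node_zonelists[0];
    struct zoneref *z;

    /* _zonerefs is terminated by an entry whose zone pointer is NULL */
    for (z = zonelist->_zonerefs; zonelist_zone(z); z++) {
        if (zonelist_zone_idx(z) > highest_zoneidx)
            continue;                /* zone is above what the GFP mask permits */
        return zonelist_zone(z);     /* first acceptable zone wins */
    }
    return NULL;                     /* no suitable zone on this node or its fallbacks */
}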
2、struct zone (manages the memory of each zone)
// Each node's memory is divided into 3 parts:
// ZONE_DMA     < 16 MB
// ZONE_NORMAL  16-896 MB, directly mapped at PAGE_OFFSET
// ZONE_HIGHMEM > 896 MB, only page cache and user processes
// ZONE_HIGHMEM is itself split into several regions; not covered in detail here
#define ZONE_DMA        0
#define ZONE_NORMAL     1
#define ZONE_HIGHMEM    2
#define MAX_NR_ZONES    3
#define ZONES_SHIFT     2
#define GFP_ZONEMASK    0x03

struct zone {
    spinlock_t lock;
    // number of free pages in the zone
    unsigned long free_pages;
    // number of reserved pages in the zone (minimum watermark)
    unsigned long pages_min;
    // lower bound used by page reclaim, also used as a threshold by the zone
    // allocator; usually 5/4 of pages_min
    unsigned long pages_low;
    // upper bound used by page reclaim, also used as a threshold by the zone
    // allocator; usually 3/2 of pages_min
    unsigned long pages_high;

    ZONE_PADDING(_pad1_)

    // fields for the page LRU lists
    spinlock_t lru_lock;
    struct list_head active_list;
    struct list_head inactive_list;
    unsigned long nr_active;
    unsigned long nr_inactive;
    atomic_t refill_counter;
    int all_unreclaimable;       /* All pages pinned */
    unsigned long pages_scanned; /* since last reclaim */

    ZONE_PADDING(_pad2_)

    /*
     * prev_priority holds the scanning priority for this zone. It is
     * defined as the scanning priority at which we achieved our reclaim
     * target at the previous try_to_free_pages() or balance_pgdat()
     * invokation.
     *
     * We use prev_priority as a measure of how much stress page reclaim is
     * under - it drives the swappiness decision: whether to unmap mapped
     * pages.
     *
     * temp_priority is used to remember the scanning priority at which
     * this zone was successfully refilled to free_pages == pages_high.
     *
     * Access to both these fields is quite racy even on uniprocessor. But
     * it is expected to average out OK.
     */
    int temp_priority;
    int prev_priority;

    // free lists, one per allocation order
    struct free_area free_area[MAX_ORDER];

    /*
     * wait_table -- the array holding the hash table
     * wait_table_size -- the size of the hash table array
     * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
     *
     * The purpose of all these is to keep track of the people
     * waiting for a page to become available and make them
     * runnable again when possible. The trouble is that this
     * consumes a lot of space, especially when so few things
     * wait on pages at a given time. So instead of using
     * per-page waitqueues, we use a waitqueue hash table.
     *
     * The bucket discipline is to sleep on the same queue when
     * colliding and wake all in that wait queue when removing.
     * When something wakes, it must check to be sure its page is
     * truly available, a la thundering herd. The cost of a
     * collision is great, but given the expected load of the
     * table, they should be so rare as to be outweighed by the
     * benefits from the saved space.
     *
     * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
     * primary users of these fields, and in mm/page_alloc.c
     * free_area_init_core() performs the initialization of them.
     */
    // wait queues for processes waiting on a page of this zone
    wait_queue_head_t *wait_table;
    // size of the wait-queue hash table
    unsigned long wait_table_size;
    unsigned long wait_table_bits;

    ZONE_PADDING(_pad3_)

    // per-CPU cache of single (order-0) pages
    // ignore on non-NUMA configurations
    struct per_cpu_pageset pageset[NR_CPUS];
    // back-pointer to the node (top-level buddy-system structure)
    struct pglist_data *zone_pgdat;
    // address of the page array that belongs to this zone
    struct page *zone_mem_map;
    // first page frame number of the zone
    // zone_start_pfn == zone_start_paddr >> PAGE_SHIFT
    unsigned long zone_start_pfn;
    // zone name
    char *name;
    // total size of the zone, including holes
    unsigned long spanned_pages;
    // total number of pages, excluding holes
    unsigned long present_pages;
} ____cacheline_maxaligned_in_smp;

// the different zone watermarks
enum zone_watermarks {
    WMARK_MIN,
    WMARK_LOW,
    WMARK_HIGH,
    NR_WMARK
};

// newer layout of struct zone
struct zone {
    unsigned long watermark[NR_WMARK];

    /*
     * We don't know if the memory that we're going to allocate will be freeable
     * or/and it will be released eventually, so to avoid totally wasting several
     * GB of ram we must reserve some of the lower zone memory (otherwise we risk
     * to run OOM on the lower zones despite there's tons of freeable ram
     * on the higher zones). This array is recalculated at runtime if the
     * sysctl_lowmem_reserve_ratio sysctl changes.
     */
    unsigned long lowmem_reserve[MAX_NR_ZONES];

    // node ID
    int node;
    /*
     * zone reclaim becomes active if more unmapped pages exist.
     */
    unsigned long min_unmapped_pages;
    unsigned long min_slab_pages;
    struct per_cpu_pageset __percpu *pageset;

    /*
     * free areas of different sizes
     */
    spinlock_t lock;
    int all_unreclaimable; /* All pages pinned */
    /* see spanned/present_pages for more description */
    seqlock_t span_seqlock;
    struct free_area free_area[MAX_ORDER];

    unsigned long *pageblock_flags;

    ZONE_PADDING(_pad1_)

    /* Fields commonly accessed by the page reclaim scanner */
    spinlock_t lru_lock;
    struct zone_lru {
        struct list_head list;
    } lru[NR_LRU_LISTS];

    struct zone_reclaim_stat reclaim_stat;

    unsigned long pages_scanned; /* since last reclaim */
    unsigned long flags;         /* zone flags, see below */

    /* Zone statistics */
    atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];

    /*
     * prev_priority holds the scanning priority for this zone. It is
     * defined as the scanning priority at which we achieved our reclaim
     * target at the previous try_to_free_pages() or balance_pgdat()
     * invocation.
     *
     * We use prev_priority as a measure of how much stress page reclaim is
     * under - it drives the swappiness decision: whether to unmap mapped
     * pages.
     *
     * Access to both this field is quite racy even on uniprocessor. But
     * it is expected to average out OK.
     */
    int prev_priority;

    /*
     * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
     * this zone's LRU. Maintained by the pageout code.
     */
    unsigned int inactive_ratio;

    ZONE_PADDING(_pad2_)
    /* Rarely used or read-mostly fields */

    /*
     * wait_table -- the array holding the hash table
     * wait_table_hash_nr_entries -- the size of the hash table array
     * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
     *
     * The purpose of all these is to keep track of the people
     * waiting for a page to become available and make them
     * runnable again when possible. The trouble is that this
     * consumes a lot of space, especially when so few things
     * wait on pages at a given time. So instead of using
     * per-page waitqueues, we use a waitqueue hash table.
     *
     * The bucket discipline is to sleep on the same queue when
     * colliding and wake all in that wait queue when removing.
     * When something wakes, it must check to be sure its page is
     * truly available, a la thundering herd. The cost of a
     * collision is great, but given the expected load of the
     * table, they should be so rare as to be outweighed by the
     * benefits from the saved space.
     *
     * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
     * primary users of these fields, and in mm/page_alloc.c
     * free_area_init_core() performs the initialization of them.
     */
    wait_queue_head_t *wait_table;
    unsigned long wait_table_hash_nr_entries;
    unsigned long wait_table_bits;

    /*
     * Discontig memory support fields.
     */
    struct pglist_data *zone_pgdat;
    /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
    unsigned long zone_start_pfn;

    /*
     * zone_start_pfn, spanned_pages and present_pages are all
     * protected by span_seqlock. It is a seqlock because it has
     * to be read outside of zone->lock, and it is done in the main
     * allocator path. But, it is written quite infrequently.
     *
     * The lock is declared along with zone->lock because it is
     * frequently read in proximity to zone->lock. It's good to
     * give them a chance of being in the same cacheline.
     */
    unsigned long spanned_pages; /* total size, including holes */
    unsigned long present_pages; /* amount of memory (excluding holes) */

    /*
     * rarely used fields:
     */
    const char *name;
} ____cacheline_internodealigned_in_smp;
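The pages_min/pages_low/pages_high fields (watermark[] plus lowmem_reserve[] in the newer layout) are what the allocator checks before handing out pages. The sketch below paraphrases zone_watermark_ok() from mm/page_alloc.c of this kernel generation; free_area[].nr_free is the per-order free-block counter of struct free_area (not shown in the excerpt), and the caller is assumed to pass in the zone's current free-page count.

/* Paraphrased watermark check (illustration, not verbatim kernel code):
 * an order-N request must leave 'mark' pages free, counting only blocks
 * of order >= N and respecting the lowmem reserve of the preferred zone. */
static int zone_watermark_ok_sketch(struct zone *z, int order, unsigned long mark,
                                    int classzone_idx, long free_pages)
{
    long min = mark;
    int o;

    if (free_pages <= min + (long)z->lowmem_reserve[classzone_idx])
        return 0;

    for (o = 0; o < order; o++) {
        /* blocks of order o cannot satisfy an order > o request */
        free_pages -= z->free_area[o].nr_free << o;

        /* require proportionally fewer free pages at higher orders */
        min >>= 1;

        if (free_pages <= min)
            return 0;
    }
    return 1;
}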
3、struct page (descriptor of each physical page)
// Every physical page frame is described by one struct page in the kernel.
// This structure tracks the state of that physical page.
struct page {
    unsigned long flags;  /* Atomic flags, some possibly updated asynchronously */
    atomic_t _count;      /* Usage count, see below */
    union {
        atomic_t _mapcount;   /* Count of ptes mapped in mms,
                               * to show when page is mapped
                               * & limit reverse map searches.
                               */
        struct {              /* SLUB */
            u16 inuse;
            u16 objects;
        };
    };
    union {
        struct {
            unsigned long private;         /* Mapping-private opaque data:
                                            * usually used for buffer_heads
                                            * if PagePrivate set; used for
                                            * swp_entry_t if PageSwapCache;
                                            * indicates order in the buddy
                                            * system if PG_buddy is set.
                                            */
            struct address_space *mapping; /* If low bit clear, points to
                                            * inode address_space, or NULL.
                                            * If page mapped as anonymous
                                            * memory, low bit is set, and
                                            * it points to anon_vma object:
                                            * see PAGE_MAPPING_ANON below.
                                            */
        };
#if USE_SPLIT_PTLOCKS
        spinlock_t ptl;
#endif
        struct kmem_cache *slab;   /* SLUB: Pointer to slab */
        struct page *first_page;   /* Compound tail pages */
    };
    union {
        pgoff_t index;   /* Our offset within mapping. */
        void *freelist;  /* SLUB: freelist req. slab lock */
    };
    struct list_head lru;  /* Pageout list, eg. active_list
                            * protected by zone->lru_lock !
                            */
    /*
     * On machines where all RAM is mapped into kernel address space,
     * we can simply calculate the virtual address. On machines with
     * highmem some memory is mapped into kernel virtual memory
     * dynamically, so we need a place to store that address.
     * Note that this field could be 16 bits on x86 ... ;)
     *
     * Architectures with slow multiplication can define
     * WANT_PAGE_VIRTUAL in asm/page.h
     */
#if defined(WANT_PAGE_VIRTUAL)
    void *virtual;   /* Kernel virtual address (NULL if
                        not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
    unsigned long debug_flags;   /* Use atomic bitops on this */
#endif
#ifdef CONFIG_KMEMCHECK
    /*
     * kmemcheck wants to track the status of each byte in a page; this
     * is a pointer to such a status block. NULL if not tracked.
     */
    void *shadow;
#endif
};
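As the mapping comment above describes, the low bit of page->mapping tells anonymous pages apart from page-cache pages. Below is a small sketch of that decoding, modeled on the kernel's PageAnon()/page_anon_vma(); PAGE_MAPPING_ANON is assumed here to be bit 0, as in this kernel generation.

struct anon_vma;                    /* defined elsewhere in the kernel */
#define PAGE_MAPPING_ANON_SKETCH 1  /* assumed value of PAGE_MAPPING_ANON */

/* Sketch: true if the page is mapped as anonymous memory */
static int page_is_anon_sketch(struct page *page)
{
    return ((unsigned long)page->mapping & PAGE_MAPPING_ANON_SKETCH) != 0;
}

/* Sketch: recover the anon_vma hidden behind the tagged mapping pointer */
static struct anon_vma *page_anon_vma_sketch(struct page *page)
{
    unsigned long mapping = (unsigned long)page->mapping;

    if (!(mapping & PAGE_MAPPING_ANON_SKETCH))
        return NULL;    /* page-cache page: mapping is a struct address_space * */
    return (struct anon_vma *)(mapping & ~(unsigned long)PAGE_MAPPING_ANON_SKETCH);
}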
4、Memory allocation policies
4.1、Allocation policy (decides which node to allocate from)
// For details see: http://blog.sina.com.cn/s/blog_6bd2fa790102znpt.html
// The policies are: interleaved allocation, preferred-node allocation,
// bound-node allocation, and (local allocation?)
// 1. Interleave: requests are spread across all nodes (or the specified nodes)
//    in round-robin (RR) order;
// 2. Preferred: allocate from the specified node first, fall back to other
//    nodes on failure;
// 3. Bind (membind): allocation is forced onto the specified nodes;
// 4. Default (localalloc): always allocate from the local node, i.e. the node
//    the current process is running on.
//    (TODO: what happens with local allocation when the local node runs out of
//    memory? And how is memory allocated when the process runs on several nodes?)
// A userspace usage sketch follows the definitions below.
enum {
    MPOL_DEFAULT,
    MPOL_PREFERRED,
    MPOL_BIND,
    MPOL_INTERLEAVE,
    MPOL_MAX,
};

/* Flags for set_mempolicy */
#define MPOL_F_STATIC_NODES   (1 << 15)
#define MPOL_F_RELATIVE_NODES (1 << 14)

/*
 * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
 * either set_mempolicy() or mbind().
 */
#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)

/* Flags for get_mempolicy */
#define MPOL_F_NODE         (1<<0) /* return next IL mode instead of node mask */
#define MPOL_F_ADDR         (1<<1) /* look up vma using address */
#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */

/* Flags for mbind */
#define MPOL_MF_STRICT   (1<<0) /* Verify existing pages in the mapping */
#define MPOL_MF_MOVE     (1<<1) /* Move pages owned by this process to conform to mapping */
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */
#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */

/*
 * Internal flags that share the struct mempolicy flags word with
 * "mode flags". These flags are allocated from bit 0 up, as they
 * are never OR'ed into the mode in mempolicy API arguments.
 */
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_LOCAL  (1 << 1) /* preferred local allocation */
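A userspace sketch of applying one of these modes, assuming libnuma's <numaif.h> wrapper for the set_mempolicy() system call is available (build with -lnuma); it binds all further allocations of the calling task to node 0 with MPOL_BIND.

#include <numaif.h>     /* set_mempolicy(), MPOL_BIND - from libnuma */
#include <stdio.h>

int main(void)
{
    unsigned long nodemask = 1UL << 0;   /* allow node 0 only */

    /* maxnode tells the kernel how many bits of nodemask to read */
    if (set_mempolicy(MPOL_BIND, &nodemask, 8 * sizeof(nodemask) + 1) != 0) {
        perror("set_mempolicy");
        return 1;
    }
    /* from here on, page allocations for this task must come from node 0 */
    return 0;
}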
4.2、Allocation mask (decides which zone memory is allocated from)
// Zone modifiers of the allocation mask
// By default memory is allocated from ZONE_NORMAL
// For details see: https://www.cnblogs.com/arnoldlu/p/8250734.html
//                  https://blog.csdn.net/farmwang/article/details/66975128
#define __GFP_DMA     ((__force gfp_t)0x01u)
#define __GFP_HIGHMEM ((__force gfp_t)0x02u)
#define __GFP_DMA32   ((__force gfp_t)0x04u)
#define __GFP_MOVABLE ((__force gfp_t)0x08u)
#define GFP_ZONEMASK  (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
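A simplified sketch of how these zone modifiers are turned into a zone index, modeled on the kernel's gfp_zone() (the real function uses a packed lookup table; ZONE_DMA32 and ZONE_MOVABLE are indices from the newer enum zone_type, not the three defines shown in section 2).

/* Sketch only: map the zone-modifier bits of a GFP mask to a zone index */
static int gfp_zone_sketch(gfp_t flags)
{
    if (flags & __GFP_DMA)
        return ZONE_DMA;
    if (flags & __GFP_DMA32)
        return ZONE_DMA32;
    if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
                 (__GFP_HIGHMEM | __GFP_MOVABLE))
        return ZONE_MOVABLE;    /* movable allocations go to ZONE_MOVABLE */
    if (flags & __GFP_HIGHMEM)
        return ZONE_HIGHMEM;
    return ZONE_NORMAL;         /* no modifier: allocate from ZONE_NORMAL */
}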
4.3、Allocation attributes (control the behavior of the allocation)
// Action modifiers of the allocation mask
// For details see: https://www.cnblogs.com/arnoldlu/p/8250734.html
#define __GFP_WAIT        ((__force gfp_t)0x10u)    /* Can wait and reschedule? */
#define __GFP_HIGH        ((__force gfp_t)0x20u)    /* Should access emergency pools? */
#define __GFP_IO          ((__force gfp_t)0x40u)    /* Can start physical IO? */
#define __GFP_FS          ((__force gfp_t)0x80u)    /* Can call down to low-level FS? */
#define __GFP_COLD        ((__force gfp_t)0x100u)   /* Cache-cold page required */
#define __GFP_NOWARN      ((__force gfp_t)0x200u)   /* Suppress page allocation failure warning */
#define __GFP_REPEAT      ((__force gfp_t)0x400u)   /* See above */
#define __GFP_NOFAIL      ((__force gfp_t)0x800u)   /* See above */
#define __GFP_NORETRY     ((__force gfp_t)0x1000u)  /* See above */
#define __GFP_COMP        ((__force gfp_t)0x4000u)  /* Add compound page metadata */
#define __GFP_ZERO        ((__force gfp_t)0x8000u)  /* Return zeroed page on success */
#define __GFP_NOMEMALLOC  ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
#define __GFP_HARDWALL    ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
#define __GFP_THISNODE    ((__force gfp_t)0x40000u) /* No fallback, no policies */
#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
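Callers rarely use these bits directly; they use the composite masks built from them. The list below is reconstructed from the gfp.h of the same kernel generation and should be read as illustrative. A typical call is alloc_pages(GFP_KERNEL, 0) for a single page whose allocation may sleep and start IO.

#define GFP_ATOMIC   (__GFP_HIGH)                        /* may not sleep, may tap reserves */
#define GFP_NOIO     (__GFP_WAIT)                        /* may sleep, must not start IO */
#define GFP_NOFS     (__GFP_WAIT | __GFP_IO)             /* may do IO, no FS recursion */
#define GFP_KERNEL   (__GFP_WAIT | __GFP_IO | __GFP_FS)  /* normal kernel allocation */
#define GFP_USER     (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_HIGHMEM)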
4.4、Watermark-related flags
// Flags related to the allocation watermarks
#define ALLOC_WMARK_MIN     WMARK_MIN
#define ALLOC_WMARK_LOW     WMARK_LOW
#define ALLOC_WMARK_HIGH    WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04
#define ALLOC_WMARK_MASK    (ALLOC_NO_WATERMARKS-1)
// For details see: https://www.codeleading.com/article/59964929993
// ALLOC_HARDER lowers the watermark by 1/4;
// ALLOC_HIGH lowers the reserved watermark by 1/2.
// The two flags relax the reserve by different amounts so that the allocation
// is more likely to succeed; ALLOC_HIGH is therefore a more aggressive way of
// requesting memory than ALLOC_HARDER. A sketch of this adjustment follows
// the definitions below.
#define ALLOC_HARDER 0x10
#define ALLOC_HIGH   0x20
#define ALLOC_CPUSET 0x40
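The sketch below paraphrases how get_page_from_freelist()/zone_watermark_ok() apply these flags: the low bits select which watermark to test against, and ALLOC_HIGH/ALLOC_HARDER shrink the required reserve (illustration only, not verbatim kernel code).

/* Sketch: compute the effective watermark an allocation must stay above.
 * If ALLOC_NO_WATERMARKS is set, the check is skipped entirely. */
static unsigned long effective_watermark_sketch(struct zone *z, int alloc_flags)
{
    /* ALLOC_WMARK_MIN/LOW/HIGH live in the low two bits */
    unsigned long min = z->watermark[alloc_flags & ALLOC_WMARK_MASK];

    if (alloc_flags & ALLOC_HIGH)
        min -= min / 2;   /* keep only half of the reserve */
    if (alloc_flags & ALLOC_HARDER)
        min -= min / 4;   /* give up another quarter */

    return min;
}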