1、struct pglist_data (top-level structure of the buddy system)
typedef struct pglist_data {
// On 64-bit systems the zones are DMA/DMA32/NORMAL
struct zone node_zones[MAX_NR_ZONES];
// MAX_ZONELISTS is 2:
// [0] -> Zonelist with fallback
// [1] -> No fallback (GFP_THISNODE)
struct zonelist node_zonelists[MAX_ZONELISTS];
// number of zones in this node
int nr_zones;
// in the original flat layout, all struct page entries of the node live in one big array
struct page *node_mem_map;
struct page_cgroup *node_page_cgroup;
// boot-time (bootmem) memory allocator data
struct bootmem_data *bdata;
// lock protecting the node size fields for memory hotplug
spinlock_t node_size_lock;
// first page frame number (PFN) of this node
unsigned long node_start_pfn;
// number of physical pages actually present (excluding holes)
unsigned long node_present_pages;
// size of the node's physical page range, including holes
unsigned long node_spanned_pages;
// node ID
int node_id;
// wait queue used to wake up the kswapd daemon
wait_queue_head_t kswapd_wait;
// task_struct of the kswapd daemon
struct task_struct *kswapd;
// maximum allocation order that kswapd should reclaim for
int kswapd_max_order;
} pg_data_t;
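With the flat memory layout implied by node_mem_map, converting between a page frame number and its struct page is plain pointer arithmetic against node_start_pfn. Below is a minimal stand-alone model of that relationship (the types and helper names are simplified inventions for illustration, not the kernel's real pfn_to_page()/page_to_pfn()):
#include <stdio.h>

/* simplified stand-ins for the kernel types, for illustration only */
struct page { unsigned long flags; };

struct pglist_data_model {
    struct page   *node_mem_map;     /* array of struct page for this node */
    unsigned long  node_start_pfn;   /* pfn of the first page in the array */
    unsigned long  node_spanned_pages;
};

/* FLATMEM-style conversion: index node_mem_map by (pfn - node_start_pfn) */
static struct page *model_pfn_to_page(struct pglist_data_model *pgdat,
                                      unsigned long pfn)
{
    return pgdat->node_mem_map + (pfn - pgdat->node_start_pfn);
}

static unsigned long model_page_to_pfn(struct pglist_data_model *pgdat,
                                       struct page *page)
{
    return (unsigned long)(page - pgdat->node_mem_map) + pgdat->node_start_pfn;
}

int main(void)
{
    static struct page mem_map[16];
    struct pglist_data_model node0 = { mem_map, 0x100, 16 };

    struct page *p = model_pfn_to_page(&node0, 0x105);
    printf("pfn 0x105 -> mem_map index %ld -> back to pfn 0x%lx\n",
           (long)(p - mem_map), model_page_to_pfn(&node0, p));
    return 0;
}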
/*
* One allocation request operates on a zonelist. A zonelist
* is a list of zones, the first one is the 'goal' of the
* allocation, the other zones are fallback zones, in decreasing
* priority.
*
* If zlcache_ptr is not NULL, then it is just the address of zlcache,
* as explained in the zonelist_cache comment further below. If zlcache_ptr
* is NULL, there is no zlcache.
*
* To speed the reading of the zonelist, the zonerefs contain the zone index
* of the entry being read. Helper functions to access information given
* a struct zoneref are
*
* zonelist_zone() - Return the struct zone * for an entry in _zonerefs
* zonelist_zone_idx() - Return the index of the zone for an entry
* zonelist_node_idx() - Return the index of the node for an entry
*/
struct zonelist {
struct zonelist_cache *zlcache_ptr; // NULL or &zlcache
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
struct zonelist_cache zlcache; // optional ...
};
/*
* This struct contains information about a zone in a zonelist. It is stored
* here to avoid dereferences into large structures and lookups of tables
*/
struct zoneref {
struct zone *zone; /* Pointer to actual zone */
int zone_idx; /* zone_idx(zoneref->zone) */
};
/*
* We cache key information from each zonelist for smaller cache
* footprint when scanning for free pages in get_page_from_freelist().
*
* 1) The BITMAP fullzones tracks which zones in a zonelist have come
* up short of free memory since the last time (last_fullzone_zap)
* we zero'd fullzones.
* 2) The array z_to_n[] maps each zone in the zonelist to its node
* id, so that we can efficiently evaluate whether that node is
* set in the current task's mems_allowed.
*
* Both fullzones and z_to_n[] are one-to-one with the zonelist,
* indexed by a zone's offset in the zonelist zones[] array.
*
* The get_page_from_freelist() routine does two scans. During the
* first scan, we skip zones whose corresponding bit in 'fullzones'
* is set or whose corresponding node in current->mems_allowed (which
* comes from cpusets) is not set. During the second scan, we bypass
* this zonelist_cache, to ensure we look methodically at each zone.
*
* Once per second, we zero out (zap) fullzones, forcing us to
* reconsider nodes that might have regained more free memory.
* The field last_full_zap is the time we last zapped fullzones.
*
* This mechanism reduces the amount of time we waste repeatedly
* re-examining zones for free memory when they came up low on
* memory only momentarily ago.
*
* The zonelist_cache struct members logically belong in struct
* zonelist. However, the mempolicy zonelists constructed for
* MPOL_BIND are intentionally variable length (and usually much
* shorter). A general purpose mechanism for handling structs with
* multiple variable length members is more mechanism than we want
* here. We resort to some special case hackery instead.
*
* The MPOL_BIND zonelists don't need this zonelist_cache (in good
* part because they are shorter), so we put the fixed length stuff
* at the front of the zonelist struct, ending in a variable length
* zones[], as is needed by MPOL_BIND.
*
* Then we put the optional zonelist cache on the end of the zonelist
* struct. This optional stuff is found by a 'zlcache_ptr' pointer in
* the fixed length portion at the front of the struct. This pointer
* both enables us to find the zonelist cache, and in the case of
* MPOL_BIND zonelists, (which will just set the zlcache_ptr to NULL)
* to know that the zonelist cache is not there.
*
* The end result is that struct zonelists come in two flavors:
* 1) The full, fixed length version, shown below, and
* 2) The custom zonelists for MPOL_BIND.
* The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache.
*
* Even though there may be multiple CPU cores on a node modifying
* fullzones or last_full_zap in the same zonelist_cache at the same
* time, we don't lock it. This is just hint data - if it is wrong now
* and then, the allocator will still function, perhaps a bit slower.
*/
struct zonelist_cache {
unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */
DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */
unsigned long last_full_zap; /* when last zap'd (jiffies) */
};
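To make the fallback semantics of _zonerefs concrete, here is a stand-alone model of the first-fit walk that get_page_from_freelist() performs over a zonelist. The structures and the pick_zone() helper are simplified inventions; the real kernel iterates with for_each_zone_zonelist() and the zoneref helpers named above, and also applies watermark and cpuset checks along the way.
#include <stdio.h>
#include <stddef.h>

/* simplified model of struct zone / struct zoneref, for illustration only */
struct zone_model { const char *name; unsigned long free_pages; };

struct zoneref_model {
    struct zone_model *zone;  /* NULL terminates the list              */
    int zone_idx;             /* cached zone index, avoids dereference */
};

/* Walk the zonerefs in fallback order and take the first zone that has
 * enough free pages and whose index does not exceed the request's
 * highest allowed zone (the "high_zoneidx" cut-off). */
static struct zone_model *pick_zone(struct zoneref_model *zrefs,
                                    int high_zoneidx, unsigned long want)
{
    for (struct zoneref_model *z = zrefs; z->zone; z++) {
        if (z->zone_idx > high_zoneidx)
            continue;                      /* zone too high for this request */
        if (z->zone->free_pages >= want)
            return z->zone;                /* first suitable zone wins */
    }
    return NULL;                           /* every fallback zone is full */
}

int main(void)
{
    struct zone_model normal = { "Normal", 0 }, dma = { "DMA", 128 };
    /* preferred zone first, then fallbacks, NULL-terminated */
    struct zoneref_model zrefs[] = {
        { &normal, 1 }, { &dma, 0 }, { NULL, 0 }
    };

    struct zone_model *z = pick_zone(zrefs, 1, 32);
    printf("allocated from: %s\n", z ? z->name : "(none)");
    return 0;
}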
2、struct zone (manages the memory of each zone)
// On 32-bit x86, each node's memory is divided into 3 parts:
// ZONE_DMA < 16 MB
// ZONE_NORMAL 16-896 MB, directly mapped at PAGE_OFFSET
// ZONE_HIGHMEM > 896 MB, only page cache and user processes
// ZONE_HIGHMEM is itself split into several ranges; not covered in detail here
#define ZONE_DMA 0
#define ZONE_NORMAL 1
#define ZONE_HIGHMEM 2
#define MAX_NR_ZONES 3
#define ZONES_SHIFT 2
#define GFP_ZONEMASK 0x03
struct zone {
spinlock_t lock;
// number of free pages in the zone
unsigned long free_pages;
// number of reserved pages in the zone (minimum watermark)
unsigned long pages_min;
// lower watermark for page reclaim, also used as a threshold by the zone allocator; usually 5/4 of pages_min
unsigned long pages_low;
// upper watermark for page reclaim, also used as a threshold by the zone allocator; usually 3/2 of pages_min
unsigned long pages_high;
ZONE_PADDING(_pad1_)
// page LRU related fields
spinlock_t lru_lock;
struct list_head active_list;
struct list_head inactive_list;
unsigned long nr_active;
unsigned long nr_inactive;
atomic_t refill_counter;
int all_unreclaimable; /* All pages pinned */
unsigned long pages_scanned; /* since last reclaim */
ZONE_PADDING(_pad2_)
/*
* prev_priority holds the scanning priority for this zone. It is
* defined as the scanning priority at which we achieved our reclaim
* target at the previous try_to_free_pages() or balance_pgdat()
* invokation.
*
* We use prev_priority as a measure of how much stress page reclaim is
* under - it drives the swappiness decision: whether to unmap mapped
* pages.
*
* temp_priority is used to remember the scanning priority at which
* this zone was successfully refilled to free_pages == pages_high.
*
* Access to both these fields is quite racy even on uniprocessor. But
* it is expected to average out OK.
*/
int temp_priority;
int prev_priority;
// free lists of blocks, one list per allocation order
struct free_area free_area[MAX_ORDER];
/*
* wait_table -- the array holding the hash table
* wait_table_size -- the size of the hash table array
* wait_table_bits -- wait_table_size == (1 << wait_table_bits)
*
* The purpose of all these is to keep track of the people
* waiting for a page to become available and make them
* runnable again when possible. The trouble is that this
* consumes a lot of space, especially when so few things
* wait on pages at a given time. So instead of using
* per-page waitqueues, we use a waitqueue hash table.
*
* The bucket discipline is to sleep on the same queue when
* colliding and wake all in that wait queue when removing.
* When something wakes, it must check to be sure its page is
* truly available, a la thundering herd. The cost of a
* collision is great, but given the expected load of the
* table, they should be so rare as to be outweighed by the
* benefits from the saved space.
*
* __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
* primary users of these fields, and in mm/page_alloc.c
* free_area_init_core() performs the initialization of them.
*/
// hashed wait queues for processes waiting on pages of this zone
wait_queue_head_t * wait_table;
// size of the wait-queue hash table
unsigned long wait_table_size;
unsigned long wait_table_bits;
ZONE_PADDING(_pad3_)
// per-CPU cache of order-0 (single) pages
// (NUMA-specific handling is not covered here)
struct per_cpu_pageset pageset[NR_CPUS];
// back pointer to the owning node (the top-level buddy-system structure)
struct pglist_data *zone_pgdat;
// address of the first struct page belonging to this zone
struct page *zone_mem_map;
// first page frame number of the zone
// zone_start_pfn == zone_start_paddr >> PAGE_SHIFT
unsigned long zone_start_pfn;
// zone name ("DMA", "Normal", "HighMem")
char *name;
// total size of the zone in pages, including holes
unsigned long spanned_pages;
// number of pages actually present in the zone, excluding holes
unsigned long present_pages;
} ____cacheline_maxaligned_in_smp;
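The 5/4 and 3/2 ratios quoted above for pages_low and pages_high match the kernel's watermark setup (setup_per_zone_pages_min() in kernels of this vintage). A small stand-alone sketch of just that arithmetic:
#include <stdio.h>

/* Given a zone's minimum watermark, derive the low/high watermarks the
 * way the comments above describe: low = min + min/4, high = min + min/2. */
static void derive_watermarks(unsigned long pages_min,
                              unsigned long *pages_low,
                              unsigned long *pages_high)
{
    *pages_low  = pages_min + pages_min / 4;   /* 5/4 of pages_min */
    *pages_high = pages_min + pages_min / 2;   /* 3/2 of pages_min */
}

int main(void)
{
    unsigned long min = 1024, low, high;

    derive_watermarks(min, &low, &high);
    printf("min=%lu low=%lu high=%lu\n", min, low, high);
    /* kswapd is woken when free pages drop below 'low' and keeps
     * reclaiming until free pages climb back above 'high'. */
    return 0;
}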
// the different zone watermarks
enum zone_watermarks {
WMARK_MIN,
WMARK_LOW,
WMARK_HIGH,
NR_WMARK
};
struct zone {
unsigned long watermark[NR_WMARK];
/*
* We don't know if the memory that we're going to allocate will be freeable
* or/and it will be released eventually, so to avoid totally wasting several
* GB of ram we must reserve some of the lower zone memory (otherwise we risk
* to run OOM on the lower zones despite there's tons of freeable ram
* on the higher zones). This array is recalculated at runtime if the
* sysctl_lowmem_reserve_ratio sysctl changes.
*/
unsigned long lowmem_reserve[MAX_NR_ZONES];
// node ID
int node;
/*
* zone reclaim becomes active if more unmapped pages exist.
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
struct per_cpu_pageset __percpu *pageset;
/*
* free areas of different sizes
*/
spinlock_t lock;
int all_unreclaimable; /* All pages pinned */
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
struct free_area free_area[MAX_ORDER];
unsigned long *pageblock_flags;
ZONE_PADDING(_pad1_)
/* Fields commonly accessed by the page reclaim scanner */
spinlock_t lru_lock;
struct zone_lru {
struct list_head list;
} lru[NR_LRU_LISTS];
struct zone_reclaim_stat reclaim_stat;
unsigned long pages_scanned; /* since last reclaim */
unsigned long flags; /* zone flags, see below */
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
/*
* prev_priority holds the scanning priority for this zone. It is
* defined as the scanning priority at which we achieved our reclaim
* target at the previous try_to_free_pages() or balance_pgdat()
* invocation.
*
* We use prev_priority as a measure of how much stress page reclaim is
* under - it drives the swappiness decision: whether to unmap mapped
* pages.
*
* Access to this field is quite racy even on uniprocessor. But
* it is expected to average out OK.
*/
int prev_priority;
/*
* The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
* this zone's LRU. Maintained by the pageout code.
*/
unsigned int inactive_ratio;
ZONE_PADDING(_pad2_)
/* Rarely used or read-mostly fields */
/*
* wait_table -- the array holding the hash table
* wait_table_hash_nr_entries -- the size of the hash table array
* wait_table_bits -- wait_table_hash_nr_entries == (1 << wait_table_bits)
*
* The purpose of all these is to keep track of the people
* waiting for a page to become available and make them
* runnable again when possible. The trouble is that this
* consumes a lot of space, especially when so few things
* wait on pages at a given time. So instead of using
* per-page waitqueues, we use a waitqueue hash table.
*
* The bucket discipline is to sleep on the same queue when
* colliding and wake all in that wait queue when removing.
* When something wakes, it must check to be sure its page is
* truly available, a la thundering herd. The cost of a
* collision is great, but given the expected load of the
* table, they should be so rare as to be outweighed by the
* benefits from the saved space.
*
* __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
* primary users of these fields, and in mm/page_alloc.c
* free_area_init_core() performs the initialization of them.
*/
wait_queue_head_t * wait_table;
unsigned long wait_table_hash_nr_entries;
unsigned long wait_table_bits;
/*
* Discontig memory support fields.
*/
struct pglist_data *zone_pgdat;
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
/*
* zone_start_pfn, spanned_pages and present_pages are all
* protected by span_seqlock. It is a seqlock because it has
* to be read outside of zone->lock, and it is done in the main
* allocator path. But, it is written quite infrequently.
*
* The lock is declared along with zone->lock because it is
* frequently read in proximity to zone->lock. It's good to
* give them a chance of being in the same cacheline.
*/
unsigned long spanned_pages; /* total size, including holes */
unsigned long present_pages; /* amount of memory (excluding holes) */
/*
* rarely used fields:
*/
const char *name;
} ____cacheline_internodealigned_in_smp;
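The wait-table comment above is easier to follow with the bucket selection written out. The kernel's page_waitqueue() hashes the struct page pointer into zone->wait_table using hash_ptr(page, zone->wait_table_bits); the stand-alone model below mimics that idea (the hash function and types here are simplified inventions):
#include <stdio.h>
#include <stdint.h>

/* stand-in for a wait queue head; the real one lives in <linux/wait.h> */
struct waitq_model { int waiter_count; };

/* crude pointer hash for illustration; the kernel's hash_ptr() is a
 * multiplicative hash that keeps the top 'bits' bits */
static unsigned long model_hash_ptr(const void *ptr, unsigned int bits)
{
    uint64_t val = (uint64_t)(uintptr_t)ptr;

    val *= 0x9E3779B97F4A7C15ULL;               /* golden-ratio multiplier */
    return (unsigned long)(val >> (64 - bits));
}

/* pick the wait-queue bucket for a page, like zone->wait_table[hash] */
static struct waitq_model *model_page_waitqueue(struct waitq_model *wait_table,
                                                unsigned int wait_table_bits,
                                                const void *page)
{
    return &wait_table[model_hash_ptr(page, wait_table_bits)];
}

int main(void)
{
    enum { BITS = 4 };                   /* wait_table_size == 1 << 4 == 16 */
    struct waitq_model table[1 << BITS] = { 0 };
    int dummy_pages[3];

    for (int i = 0; i < 3; i++) {
        struct waitq_model *q = model_page_waitqueue(table, BITS, &dummy_pages[i]);
        q->waiter_count++;               /* "sleep" on the shared bucket */
        printf("page %d -> bucket %ld\n", i, (long)(q - table));
    }
    return 0;
}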
3、struct page (describes each physical page frame)
// Every physical page frame has one struct page in the OS.
// This structure tracks the state of that physical page.
struct page {
unsigned long flags; // Atomic flags, some possibly updated asynchronously
atomic_t _count; // Usage count, see below
union {
atomic_t _mapcount; /* Count of ptes mapped in mms,
* to show when page is mapped
* & limit reverse map searches.
*/
struct { /* SLUB */
u16 inuse;
u16 objects;
};
};
union {
struct {
unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads
* if PagePrivate set; used for
* swp_entry_t if PageSwapCache;
* indicates order in the buddy
* system if PG_buddy is set.
*/
struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/
};
#if USE_SPLIT_PTLOCKS
spinlock_t ptl;
#endif
struct kmem_cache *slab; /* SLUB: Pointer to slab */
struct page *first_page; /* Compound tail pages */
};
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* SLUB: freelist req. slab lock */
};
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
*/
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
unsigned long debug_flags; /* Use atomic bitops on this */
#endif
#ifdef CONFIG_KMEMCHECK
/*
* kmemcheck wants to track the status of each byte in a page; this
* is a pointer to such a status block. NULL if not tracked.
*/
void *shadow;
#endif
};
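The 'mapping' field's low-bit trick (PAGE_MAPPING_ANON) can be modelled outside the kernel: an anonymous page stores its anon_vma pointer with bit 0 set, a file-backed page stores the address_space pointer with bit 0 clear, and helpers mask the bit to tell the two apart. A stand-alone sketch with invented names:
#include <stdio.h>
#include <stdint.h>

#define MODEL_MAPPING_ANON 0x1UL  /* low bit tags an anon_vma pointer */

struct address_space_model { const char *host; };
struct anon_vma_model      { int dummy; };

/* stash either pointer in the same field, tagging anon_vma with bit 0 */
static void *encode_anon(struct anon_vma_model *av)
{
    return (void *)((uintptr_t)av | MODEL_MAPPING_ANON);
}

static int page_is_anon(void *mapping)
{
    return ((uintptr_t)mapping & MODEL_MAPPING_ANON) != 0;
}

/* recover the real pointer by clearing the tag bit */
static void *mapping_untagged(void *mapping)
{
    return (void *)((uintptr_t)mapping & ~MODEL_MAPPING_ANON);
}

int main(void)
{
    struct address_space_model inode_mapping = { "some inode" };
    struct anon_vma_model      av;

    void *file_page_mapping = &inode_mapping;       /* bit 0 clear */
    void *anon_page_mapping = encode_anon(&av);     /* bit 0 set   */

    printf("file page anon? %d\n", page_is_anon(file_page_mapping));
    printf("anon page anon? %d, anon_vma recovered? %d\n",
           page_is_anon(anon_page_mapping),
           mapping_untagged(anon_page_mapping) == (void *)&av);
    return 0;
}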
4、Memory allocation policy
4.1、Allocation policy (decides which node to allocate from)
// See: http://blog.sina.com.cn/s/blog_6bd2fa790102znpt.html
// The policies are: interleaved allocation, preferred-node allocation, bound-node allocation, and (local allocation?)
// 1. Interleave (MPOL_INTERLEAVE): allocation requests are spread across all nodes, or a given set of nodes, in round-robin (RR) order;
// 2. Preferred (MPOL_PREFERRED): allocate from the specified node first, and fall back to other nodes on failure;
// 3. Bind (MPOL_BIND): allocations are forced to come from the specified nodes;
// 4. Default (MPOL_DEFAULT, local allocation): always allocate from the local node, i.e. the node the requesting process is currently running on;
// (a minimal user-space sketch of selecting these policies follows the flag definitions at the end of this subsection)
(TODO: what happens with local allocation when the local node runs out of memory? How is memory allocated when a process runs on several nodes?)
enum {
MPOL_DEFAULT,
MPOL_PREFERRED,
MPOL_BIND,
MPOL_INTERLEAVE,
MPOL_MAX,
};
/* Flags for set_mempolicy */
#define MPOL_F_STATIC_NODES (1 << 15)
#define MPOL_F_RELATIVE_NODES (1 << 14)
/*
* MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
* either set_mempolicy() or mbind().
*/
#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
#define MPOL_F_ADDR (1<<1) /* look up vma using address */
#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
/* Flags for mbind */
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */
#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */
/*
* Internal flags that share the struct mempolicy flags word with
* "mode flags". These flags are allocated from bit 0 up, as they
* are never OR'ed into the mode in mempolicy API arguments.
*/
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
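From user space these policies are chosen with the set_mempolicy()/mbind() system calls, which libnuma wraps. A minimal sketch that switches the calling thread to interleaved allocation, assuming a NUMA machine where nodes 0 and 1 exist and libnuma is installed (compile with -lnuma; error handling kept short):
#include <numa.h>       /* libnuma: numa_available()                        */
#include <numaif.h>     /* MPOL_* constants and the set_mempolicy() wrapper */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    if (numa_available() < 0) {
        fprintf(stderr, "NUMA is not available on this system\n");
        return 1;
    }

    /* Interleave this thread's future allocations across nodes 0 and 1.
     * The nodemask is a bitmap of node ids; maxnode counts bits. */
    unsigned long nodemask = (1UL << 0) | (1UL << 1);
    if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8 + 1) != 0) {
        perror("set_mempolicy");
        return 1;
    }

    /* Pages of this buffer are distributed round-robin over nodes 0/1
     * as they are first touched. */
    size_t sz = 64UL << 20;
    char *buf = malloc(sz);
    if (!buf)
        return 1;
    memset(buf, 0, sz);                       /* fault the pages in */
    printf("touched %zu MiB under MPOL_INTERLEAVE (nodes 0-1)\n", sz >> 20);

    free(buf);
    set_mempolicy(MPOL_DEFAULT, NULL, 0);     /* back to local allocation */
    return 0;
}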
4.2、Zone modifiers (decide which zone memory is allocated from)
// Zone selection mask
// By default memory is allocated from ZONE_NORMAL
// See: https://www.cnblogs.com/arnoldlu/p/8250734.html
// https://blog.csdn.net/farmwang/article/details/66975128
#define __GFP_DMA ((__force gfp_t)0x01u)
#define __GFP_HIGHMEM ((__force gfp_t)0x02u)
#define __GFP_DMA32 ((__force gfp_t)0x04u)
#define __GFP_MOVABLE ((__force gfp_t)0x08u)
#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
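How these zone bits turn into a zone index can be sketched after the pattern of older gfp_zone() implementations (later kernels use a GFP_ZONE_TABLE lookup; ZONE_MOVABLE handling is left out here, and all names below are simplified stand-ins):
#include <stdio.h>

/* simplified copies of the flag values and zone indices above */
#define M_GFP_DMA      0x01u
#define M_GFP_HIGHMEM  0x02u
#define M_GFP_DMA32    0x04u
#define M_GFP_MOVABLE  0x08u

enum model_zone { M_ZONE_DMA, M_ZONE_DMA32, M_ZONE_NORMAL, M_ZONE_HIGHMEM };

/* Precedence modelled on pre-GFP_ZONE_TABLE gfp_zone(): an explicit DMA/DMA32
 * request wins, __GFP_HIGHMEM lifts the ceiling to HighMem, and everything
 * else falls back to ZONE_NORMAL (__GFP_MOVABLE / ZONE_MOVABLE omitted). */
static enum model_zone model_gfp_zone(unsigned int flags)
{
    if (flags & M_GFP_DMA)
        return M_ZONE_DMA;
    if (flags & M_GFP_DMA32)
        return M_ZONE_DMA32;
    if (flags & M_GFP_HIGHMEM)
        return M_ZONE_HIGHMEM;
    return M_ZONE_NORMAL;
}

int main(void)
{
    static const char *names[] = { "DMA", "DMA32", "Normal", "HighMem" };

    printf("no zone bits  -> %s\n", names[model_gfp_zone(0)]);
    printf("__GFP_HIGHMEM -> %s\n", names[model_gfp_zone(M_GFP_HIGHMEM)]);
    printf("__GFP_DMA     -> %s\n", names[model_gfp_zone(M_GFP_DMA)]);
    return 0;
}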
4.3、Allocation behaviour flags (control how the allocation proceeds)
// Action modifier mask
// See: https://www.cnblogs.com/arnoldlu/p/8250734.html
#define __GFP_WAIT ((__force gfp_t)0x10u) /* Can wait and reschedule? */
#define __GFP_HIGH ((__force gfp_t)0x20u) /* Should access emergency pools? */
#define __GFP_IO ((__force gfp_t)0x40u) /* Can start physical IO? */
#define __GFP_FS ((__force gfp_t)0x80u) /* Can call down to low-level FS? */
#define __GFP_COLD ((__force gfp_t)0x100u) /* Cache-cold page required */
#define __GFP_NOWARN ((__force gfp_t)0x200u) /* Suppress page allocation failure warning */
#define __GFP_REPEAT ((__force gfp_t)0x400u) /* Try hard to allocate, the attempt may still fail */
#define __GFP_NOFAIL ((__force gfp_t)0x800u) /* Retry indefinitely, caller cannot handle failure */
#define __GFP_NORETRY ((__force gfp_t)0x1000u)/* Do not retry, fail immediately */
#define __GFP_COMP ((__force gfp_t)0x4000u)/* Add compound page metadata */
#define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */
#define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
#define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */
#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
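For orientation, the familiar composite masks of kernels from roughly this era are just combinations of the bits listed above (exact contents vary a little between versions):
#define GFP_ATOMIC   (__GFP_HIGH)                       /* may not sleep, may tap reserves  */
#define GFP_NOIO     (__GFP_WAIT)                       /* may sleep, but must not start IO */
#define GFP_NOFS     (__GFP_WAIT | __GFP_IO)            /* may do IO, but no FS callbacks   */
#define GFP_KERNEL   (__GFP_WAIT | __GFP_IO | __GFP_FS) /* normal kernel allocation         */
#define GFP_USER     (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_HIGHMEM)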
4.4、Watermark-related allocation flags
// flags selecting the watermark used during allocation
#define ALLOC_WMARK_MIN WMARK_MIN
#define ALLOC_WMARK_LOW WMARK_LOW
#define ALLOC_WMARK_HIGH WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04
#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
// See: https://www.codeleading.com/article/59964929993
// ALLOC_HARDER lowers the watermark by 1/4 of its value;
// ALLOC_HIGH lowers the reserved watermark by 1/2.
// The two flags shrink the amount of reserved memory to different degrees, so the allocation is more likely to succeed;
// this also shows that ALLOC_HIGH is a more aggressive way of requesting memory than ALLOC_HARDER.
#define ALLOC_HARDER 0x10
#define ALLOC_HIGH 0x20
#define ALLOC_CPUSET 0x40
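A stand-alone sketch of how ALLOC_HIGH and ALLOC_HARDER relax the check in zone_watermark_ok() (heavily simplified: the real function also subtracts lowmem_reserve and walks the per-order free lists):
#include <stdio.h>

#define M_ALLOC_HARDER 0x10  /* mirror of ALLOC_HARDER above */
#define M_ALLOC_HIGH   0x20  /* mirror of ALLOC_HIGH above   */

/* Simplified order-0 watermark check: ALLOC_HIGH drops the required reserve
 * by 1/2, ALLOC_HARDER by a further 1/4, so the same number of free pages
 * can pass a check it would otherwise fail. */
static int model_watermark_ok(long free_pages, long mark, int alloc_flags)
{
    long min = mark;

    if (alloc_flags & M_ALLOC_HIGH)
        min -= min / 2;
    if (alloc_flags & M_ALLOC_HARDER)
        min -= min / 4;

    return free_pages > min;
}

int main(void)
{
    long free = 600, mark = 1024;

    printf("no flags:     %d\n", model_watermark_ok(free, mark, 0));
    printf("ALLOC_HARDER: %d\n", model_watermark_ok(free, mark, M_ALLOC_HARDER));
    printf("ALLOC_HIGH:   %d\n", model_watermark_ok(free, mark, M_ALLOC_HIGH));
    printf("HIGH|HARDER:  %d\n",
           model_watermark_ok(free, mark, M_ALLOC_HIGH | M_ALLOC_HARDER));
    return 0;
}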