1、struct pglist_data (top-level structure of the buddy system)
typedef struct pglist_data {
// On 64-bit systems the zones are DMA/DMA32/NORMAL
struct zone node_zones[MAX_NR_ZONES];
// MAX_ZONELISTS is 2:
// [0] -> Zonelist with fallback
// [1] -> No fallback (GFP_THISNODE)
struct zonelist node_zonelists[MAX_ZONELISTS];
// number of zones in this node
int nr_zones;
// in the original flat layout, all struct page entries of the node live in one big array
struct page *node_mem_map;
struct page_cgroup *node_page_cgroup;
// boot-time (bootmem) memory allocator data
struct bootmem_data *bdata;
// lock protecting the node size fields for memory hotplug
spinlock_t node_size_lock;
// first page frame number (PFN) of this node
unsigned long node_start_pfn;
// number of physical pages actually present (excluding holes)
unsigned long node_present_pages;
// size of the node's physical page range, including holes
unsigned long node_spanned_pages;
// node ID
int node_id;
// wait queue used to wake up the kswapd daemon
wait_queue_head_t kswapd_wait;
// task_struct of the kswapd daemon
struct task_struct *kswapd;
// maximum allocation order that kswapd should reclaim for
int kswapd_max_order;
} pg_data_t;
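With the flat memory layout implied by node_mem_map, converting between a page frame number and its struct page is plain pointer arithmetic against node_start_pfn. Below is a minimal stand-alone model of that relationship (the types and helper names are simplified inventions for illustration, not the kernel's real pfn_to_page()/page_to_pfn()):
#include <stdio.h>

/* simplified stand-ins for the kernel types, for illustration only */
struct page { unsigned long flags; };

struct pglist_data_model {
    struct page   *node_mem_map;     /* array of struct page for this node */
    unsigned long  node_start_pfn;   /* pfn of the first page in the array */
    unsigned long  node_spanned_pages;
};

/* FLATMEM-style conversion: index node_mem_map by (pfn - node_start_pfn) */
static struct page *model_pfn_to_page(struct pglist_data_model *pgdat,
                                      unsigned long pfn)
{
    return pgdat->node_mem_map + (pfn - pgdat->node_start_pfn);
}

static unsigned long model_page_to_pfn(struct pglist_data_model *pgdat,
                                       struct page *page)
{
    return (unsigned long)(page - pgdat->node_mem_map) + pgdat->node_start_pfn;
}

int main(void)
{
    static struct page mem_map[16];
    struct pglist_data_model node0 = { mem_map, 0x100, 16 };

    struct page *p = model_pfn_to_page(&node0, 0x105);
    printf("pfn 0x105 -> mem_map index %ld -> back to pfn 0x%lx\n",
           (long)(p - mem_map), model_page_to_pfn(&node0, p));
    return 0;
}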
/*
* One allocation request operates on a zonelist. A zonelist
* is a list of zones, the first one is the 'goal' of the
* allocation, the other zones are fallback zones, in decreasing
* priority.
*
* If zlcache_ptr is not NULL, then it is just the address of zlcache,
* as explained in the zonelist_cache comment further below. If zlcache_ptr
* is NULL, there is no zlcache.
*
* To speed the reading of the zonelist, the zonerefs contain the zone index
* of the entry being read. Helper functions to access information given
* a struct zoneref are
*
* zonelist_zone() - Return the struct zone * for an entry in _zonerefs
* zonelist_zone_idx() - Return the index of the zone for an entry
* zonelist_node_idx() - Return the index of the node for an entry
*/
struct zonelist {
struct zonelist_cache *zlcache_ptr; // NULL or &zlcache
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
struct zonelist_cache zlcache; // optional ...
};
/*
* This struct contains information about a zone in a zonelist. It is stored
* here to avoid dereferences into large structures and lookups of tables
*/
struct zoneref {
struct zone *zone; /* Pointer to actual zone */
int zone_idx; /* zone_idx(zoneref->zone) */
};
/*
* We cache key information from each zonelist for smaller cache
* footprint when scanning for free pages in get_page_from_freelist().
*
* 1) The BITMAP fullzones tracks which zones in a zonelist have come
* up short of free memory since the last time (last_fullzone_zap)
* we zero'd fullzones.
* 2) The array z_to_n[] maps each zone in the zonelist to its node
* id, so that we can efficiently evaluate whether that node is
* set in the current task's mems_allowed.
*
* Both fullzones and z_to_n[] are one-to-one with the zonelist,
* indexed by a zone's offset in the zonelist zones[] array.
*
* The get_page_from_freelist() routine does two scans. During the
* first scan, we skip zones whose corresponding bit in 'fullzones'
* is set or whose corresponding node in current->mems_allowed (which
* comes from cpusets) is not set. During the second scan, we bypass
* this zonelist_cache, to ensure we look methodically at each zone.
*
* Once per second, we zero out (zap) fullzones, forcing us to
* reconsider nodes that might have regained more free memory.
* The field last_full_zap is the time we last zapped fullzones.
*
* This mechanism reduces the amount of time we waste repeatedly
* re-examining zones for free memory when they came up low on
* memory only momentarily ago.
*
* The zonelist_cache struct members logically belong in struct
* zonelist. However, the mempolicy zonelists constructed for
* MPOL_BIND are intentionally variable length (and usually much
* shorter). A general purpose mechanism for handling structs with
* multiple variable length members is more mechanism than we want
* here. We resort to some special case hackery instead.
*
* The MPOL_BIND zonelists don't need this zonelist_cache (in good
* part because they are shorter), so we put the fixed length stuff
* at the front of the zonelist struct, ending in a variable length
* zones[], as is needed by MPOL_BIND.
*
* Then we put the optional zonelist cache on the end of the zonelist
* struct. This optional stuff is found by a 'zlcache_ptr' pointer in
* the fixed length portion at the front of the struct. This pointer
* both enables us to find the zonelist cache, and in the case of
* MPOL_BIND zonelists, (which will just set the zlcache_ptr to NULL)
* to know that the zonelist cache is not there.
*
* The end result is that struct zonelists come in two flavors:
* 1) The full, fixed length version, shown below, and
* 2) The custom zonelists for MPOL_BIND.
* The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache.
*
* Even though there may be multiple CPU cores on a node modifying
* fullzones or last_full_zap in the same zonelist_cache at the same
* time, we don't lock it. This is just hint data - if it is wrong now
* and then, the allocator will still function, perhaps a bit slower.
*/
struct zonelist_cache {
unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */
DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */
unsigned long last_full_zap; /* when last zap'd (jiffies) */
};
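To make the fallback semantics of _zonerefs concrete, here is a stand-alone model of the first-fit walk that get_page_from_freelist() performs over a zonelist. The structures and the pick_zone() helper are simplified inventions; the real kernel iterates with for_each_zone_zonelist() and the zoneref helpers named above, and also applies watermark and cpuset checks along the way.
#include <stdio.h>
#include <stddef.h>

/* simplified model of struct zone / struct zoneref, for illustration only */
struct zone_model { const char *name; unsigned long free_pages; };

struct zoneref_model {
    struct zone_model *zone;  /* NULL terminates the list              */
    int zone_idx;             /* cached zone index, avoids dereference */
};

/* Walk the zonerefs in fallback order and take the first zone that has
 * enough free pages and whose index does not exceed the request's
 * highest allowed zone (the "high_zoneidx" cut-off). */
static struct zone_model *pick_zone(struct zoneref_model *zrefs,
                                    int high_zoneidx, unsigned long want)
{
    for (struct zoneref_model *z = zrefs; z->zone; z++) {
        if (z->zone_idx > high_zoneidx)
            continue;                      /* zone too high for this request */
        if (z->zone->free_pages >= want)
            return z->zone;                /* first suitable zone wins */
    }
    return NULL;                           /* every fallback zone is full */
}

int main(void)
{
    struct zone_model normal = { "Normal", 0 }, dma = { "DMA", 128 };
    /* preferred zone first, then fallbacks, NULL-terminated */
    struct zoneref_model zrefs[] = {
        { &normal, 1 }, { &dma, 0 }, { NULL, 0 }
    };

    struct zone_model *z = pick_zone(zrefs, 1, 32);
    printf("allocated from: %s\n", z ? z->name : "(none)");
    return 0;
}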
2、struct zone (manages the memory of each zone)
// On 32-bit x86, each node's memory is divided into 3 parts:
// ZONE_DMA < 16 MB
// ZONE_NORMAL 16-896 MB, directly mapped at PAGE_OFFSET
// ZONE_HIGHMEM > 896 MB, only page cache and user processes
// ZONE_HIGHMEM is itself split into several ranges; not covered in detail here
#define ZONE_DMA 0
#define ZONE_NORMAL 1
#define ZONE_HIGHMEM 2
#define MAX_NR_ZONES 3
#define ZONES_SHIFT 2
#define GFP_ZONEMASK 0x03
struct zone {
spinlock_t lock;
// number of free pages in the zone
unsigned long free_pages;
// number of reserved pages in the zone (minimum watermark)
unsigned long pages_min;
// lower watermark for page reclaim, also used as a threshold by the zone allocator; usually 5/4 of pages_min
unsigned long pages_low;
// upper watermark for page reclaim, also used as a threshold by the zone allocator; usually 3/2 of pages_min
unsigned long pages_high;
ZONE_PADDING(_pad1_)
// page LRU related fields
spinlock_t lru_lock;
struct list_head active_list;
struct list_head inactive_list;
unsigned long nr_active;
unsigned long nr_inactive;
atomic_t refill_counter;
int all_unreclaimable; /* All pages pinned */
unsigned long pages_scanned; /* since last reclaim */
ZONE_PADDING(_pad2_)
/*
* prev_priority holds the scanning priority for this zone. It is
* defined as the scanning priority at which we achieved our reclaim
* target at the previous try_to_free_pages() or balance_pgdat()
* invokation.
*
* We use prev_priority as a measure of how much stress page reclaim is
* under - it drives the swappiness decision: whether to unmap mapped
* pages.
*
* temp_priority is used to remember the scanning priority at which
* this zone was successfully refilled to free_pages == pages_high.
*
* Access to both these fields is quite racy even on uniprocessor. But
* it is expected to average out OK.
*/
int temp_priority;
int prev_priority;
// free lists of blocks, one list per allocation order
struct free_area free_area[MAX_ORDER];
/*
* wait_table -- the array holding the hash table
* wait_table_size -- the size of the hash table array
* wait_table_bits -- wait_table_size == (1 << wait_table_bits)
*
* The purpose of all these is to keep track of the people
* waiting for a page to become available and make them
* runnable again when possible. The trouble is that this
* consumes a lot of space, especially when so few things
* wait on pages at a given time. So instead of using
* per-page waitqueues, we use a waitqueue hash table.
*
* The bucket discipline is to sleep on the same queue when
* colliding and wake all in that wait queue when removing.
* When something wakes, it must check to be sure its page is
* truly available, a la thundering herd. The cost of a
* collision is great, but given the expected load of the
* table, they should be so rare as to be outweighed by the
* benefits from the saved space.
*
* __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
* primary users of these fields, and in mm/page_alloc.c
* free_area_init_core() performs the initialization of them.
*/
// hashed wait queues for processes waiting on pages of this zone
wait_queue_head_t * wait_table;
// size of the wait-queue hash table
unsigned long wait_table_size;
unsigned long wait_table_bits;
ZONE_PADDING(_pad3_)
// per-CPU cache of order-0 (single) pages
// (NUMA-specific handling is not covered here)
struct per_cpu_pageset pageset[NR_CPUS];
// back pointer to the owning node (the top-level buddy-system structure)
struct pglist_data *zone_pgdat;
// address of the first struct page belonging to this zone
struct page *zone_mem_map;
// first page frame number of the zone
// zone_start_pfn == zone_start_paddr >> PAGE_SHIFT
unsigned long zone_start_pfn;
// zone name ("DMA", "Normal", "HighMem")
char *name;
// total size of the zone in pages, including holes
unsigned long spanned_pages;
// number of pages actually present in the zone, excluding holes
unsigned long present_pages;
} ____cacheline_maxaligned_in_smp;
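The 5/4 and 3/2 ratios quoted above for pages_low and pages_high match the kernel's watermark setup (setup_per_zone_pages_min() in kernels of this vintage). A small stand-alone sketch of just that arithmetic:
#include <stdio.h>

/* Given a zone's minimum watermark, derive the low/high watermarks the
 * way the comments above describe: low = min + min/4, high = min + min/2. */
static void derive_watermarks(unsigned long pages_min,
                              unsigned long *pages_low,
                              unsigned long *pages_high)
{
    *pages_low  = pages_min + pages_min / 4;   /* 5/4 of pages_min */
    *pages_high = pages_min + pages_min / 2;   /* 3/2 of pages_min */
}

int main(void)
{
    unsigned long min = 1024, low, high;

    derive_watermarks(min, &low, &high);
    printf("min=%lu low=%lu high=%lu\n", min, low, high);
    /* kswapd is woken when free pages drop below 'low' and keeps
     * reclaiming until free pages climb back above 'high'. */
    return 0;
}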
// the different zone watermarks
enum zone_watermarks {
WMARK_MIN,
WMARK_LOW,
WMARK_HIGH,
NR_WMARK
};
struct zone {
unsigned long watermark[NR_WMARK];
/*
* We don't know if the memory that we're going to allocate will be freeable
* or/and it will be released eventually, so to avoid totally wasting several
* GB of ram we must reserve some of the lower zone memory (otherwise we risk
* to run OOM on the lower zones despite there's tons of freeable ram
* on the higher zones). This array is recalculated at runtime if the
* sysctl_lowmem_reserve_ratio sysctl changes.
*/
unsigned long lowmem_reserve[MAX_NR_ZONES];
// node ID
int node;
/*
* zone reclaim becomes active if more unmapped pages exist.
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
struct per_cpu_pageset __percpu *pageset;
/*
* free areas of different sizes
*/
spinlock_t lock;
int all_unreclaimable; /* All pages pinned */
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
struct free_area free_area[MAX_ORDER];
unsigned long *pageblock_flags;
ZONE_PADDING(_pad1_)
/* Fields commonly accessed by the page reclaim scanner */
spinlock_t lru_lock;
struct zone_lru {
struct list_head list;
} lru[NR_LRU_LISTS];
struct zone_reclaim_stat reclaim_stat;
unsigned long pages_scanned; /* since last reclaim */
unsigned long flags; /* zone flags, see below */
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
/*
* prev_priority holds the scanning priority for this zone. It is
* defined as the scanning priority at which we achieved our reclaim
* target at the previous try_to_free_pages() or balance_pgdat()
* invocation.
*
* We use prev_priority as a measure of how much stress page reclaim is
* under - it drives the swappiness decision: whether to unmap mapped
* pages.
*
* Access to this field is quite racy even on uniprocessor. But
* it is expected to average out OK.
*/
int prev_priority;
/*
* The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
* this zone's LRU. Maintained by the pageout code.
*/
unsigned int inactive_ratio;
ZONE_PADDING(_pad2_)
/* Rarely used or read-mostly fields */
/*
* wait_table -- the array holding the hash table
* wait_table_hash_nr_entries -- the size of the hash table array
* wait_table_bits -- wait_table_hash_nr_entries == (1 << wait_table_bits)
*
* The purpose of all these is to keep track of the people
* waiting for a page to become available and make them
* runnable again when possible. The trouble is that this
* consumes a lot of space, especially when so few things
* wait on pages at a given time. So instead of using
* per-page waitqueues, we use a waitqueue hash table.
*
* The bucket discipline is to sleep on the same queue when
* colliding and wake all in that wait queue when removing.
* When something wakes, it must check to be sure its page is
* truly available, a la thundering herd. The cost of a
* collision is great, but given the expected load of the
* table, they should be so rare as to be outweighed by the
* benefits from the saved space.
*
* __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
* primary users of these fields, and in mm/page_alloc.c
* free_area_init_core() performs the initialization of them.
*/
wait_queue_head_t * wait_table;
unsigned long wait_table_hash_nr_entries;
unsigned long wait_table_bits;
/*
* Discontig memory support fields.
*/
struct pglist_data *zone_pgdat;
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
/*
* zone_start_pfn, spanned_pages and present_pages are all
* protected by span_seqlock. It is a seqlock because it has
* to be read outside of zone->lock, and it is done in the main
* allocator path. But, it is written quite infrequently.
*
* The lock is declared along with zone->lock because it is
* frequently read in proximity to zone->lock. It's good to
* give them a chance of being in the same cacheline.
*/
unsigned long spanned_pages; /* total size, including holes */
unsigned long present_pages; /* amount of memory (excluding holes) */
/*
* rarely used fields:
*/
const char *name;
} ____cacheline_internodealigned_in_smp;
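The wait-table comment above is easier to follow with the bucket selection written out. The kernel's page_waitqueue() hashes the struct page pointer into zone->wait_table using hash_ptr(page, zone->wait_table_bits); the stand-alone model below mimics that idea (the hash function and types here are simplified inventions):
#include <stdio.h>
#include <stdint.h>

/* stand-in for a wait queue head; the real one lives in <linux/wait.h> */
struct waitq_model { int waiter_count; };

/* crude pointer hash for illustration; the kernel's hash_ptr() is a
 * multiplicative hash that keeps the top 'bits' bits */
static unsigned long model_hash_ptr(const void *ptr, unsigned int bits)
{
    uint64_t val = (uint64_t)(uintptr_t)ptr;

    val *= 0x9E3779B97F4A7C15ULL;               /* golden-ratio multiplier */
    return (unsigned long)(val >> (64 - bits));
}

/* pick the wait-queue bucket for a page, like zone->wait_table[hash] */
static struct waitq_model *model_page_waitqueue(struct waitq_model *wait_table,
                                                unsigned int wait_table_bits,
                                                const void *page)
{
    return &wait_table[model_hash_ptr(page, wait_table_bits)];
}

int main(void)
{
    enum { BITS = 4 };                   /* wait_table_size == 1 << 4 == 16 */
    struct waitq_model table[1 << BITS] = { 0 };
    int dummy_pages[3];

    for (int i = 0; i < 3; i++) {
        struct waitq_model *q = model_page_waitqueue(table, BITS, &dummy_pages[i]);
        q->waiter_count++;               /* "sleep" on the shared bucket */
        printf("page %d -> bucket %ld\n", i, (long)(q - table));
    }
    return 0;
}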
3、struct page (describes each physical page frame)
// Every physical page frame has one struct page in the OS.
// This structure tracks the state of that physical page.
struct page {
unsigned long flags; // Atomic flags, some possibly updated asynchronously
atomic_t _count; // Usage count, see below
union {
atomic_t _mapcount; /* Count of ptes mapped in mms,
* to show when page is mapped
* & limit reverse map searches.
*/
struct { /* SLUB */
u16 inuse;
u16 objects;
};
};
union {
struct {
unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads
* if PagePrivate set; used for
* swp_entry_t if PageSwapCache;
* indicates order in the buddy
* system if PG_buddy is set.
*/
struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/
};
#if USE_SPLIT_PTLOCKS
spinlock_t ptl;
#endif
struct kmem_cache *slab; /* SLUB: Pointer to slab */
struct page *first_page; /* Compound tail pages */
};
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* SLUB: freelist req. slab lock */
};
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
*/
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
unsigned long debug_flags; /* Use atomic bitops on this */
#endif
#ifdef CONFIG_KMEMCHECK
/*
* kmemcheck wants to track the status of each byte in a page; this
* is a pointer to such a status block. NULL if not tracked.
*/
void *shadow;
#endif
};
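The 'mapping' field's low-bit trick (PAGE_MAPPING_ANON) can be modelled outside the kernel: an anonymous page stores its anon_vma pointer with bit 0 set, a file-backed page stores the address_space pointer with bit 0 clear, and helpers mask the bit to tell the two apart. A stand-alone sketch with invented names:
#include <stdio.h>
#include <stdint.h>

#define MODEL_MAPPING_ANON 0x1UL  /* low bit tags an anon_vma pointer */

struct address_space_model { const char *host; };
struct anon_vma_model      { int dummy; };

/* stash either pointer in the same field, tagging anon_vma with bit 0 */
static void *encode_anon(struct anon_vma_model *av)
{
    return (void *)((uintptr_t)av | MODEL_MAPPING_ANON);
}

static int page_is_anon(void *mapping)
{
    return ((uintptr_t)mapping & MODEL_MAPPING_ANON) != 0;
}

/* recover the real pointer by clearing the tag bit */
static void *mapping_untagged(void *mapping)
{
    return (void *)((uintptr_t)mapping & ~MODEL_MAPPING_ANON);
}

int main(void)
{
    struct address_space_model inode_mapping = { "some inode" };
    struct anon_vma_model      av;

    void *file_page_mapping = &inode_mapping;       /* bit 0 clear */
    void *anon_page_mapping = encode_anon(&av);     /* bit 0 set   */

    printf("file page anon? %d\n", page_is_anon(file_page_mapping));
    printf("anon page anon? %d, anon_vma recovered? %d\n",
           page_is_anon(anon_page_mapping),
           mapping_untagged(anon_page_mapping) == (void *)&av);
    return 0;
}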
4、Memory allocation policy
4.1、Allocation policy (decides which node to allocate from)
// See: http://blog.sina.com.cn/s/blog_6bd2fa790102znpt.html
// The policies are: interleaved allocation, preferred-node allocation, bound-node allocation, and (local allocation?)
// 1. Interleave (MPOL_INTERLEAVE): allocation requests are spread across all nodes, or a given set of nodes, in round-robin (RR) order;
// 2. Preferred (MPOL_PREFERRED): allocate from the specified node first, and fall back to other nodes on failure;
// 3. Bind (MPOL_BIND): allocations are forced to come from the specified nodes;
// 4. Default (MPOL_DEFAULT, local allocation): always allocate from the local node, i.e. the node the requesting process is currently running on;
// (a minimal user-space sketch of selecting these policies follows the flag definitions at the end of this subsection)
(TODO: what happens with local allocation when the local node runs out of memory? How is memory allocated when a process runs on several nodes?)
enum {
MPOL_DEFAULT,
MPOL_PREFERRED,
MPOL_BIND,
MPOL_INTERLEAVE,
MPOL_MAX,
};
/* Flags for set_mempolicy */
#define MPOL_F_STATIC_NODES (1 << 15)
#define MPOL_F_RELATIVE_NODES (1 << 14)
/*
* MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
* either set_mempolicy() or mbind().
*/
#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
#define MPOL_F_ADDR (1<<1) /* look up vma using address */
#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
/* Flags for mbind */
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */
#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */
/*
* Internal flags that share the struct mempolicy flags word with
* "mode flags". These flags are allocated from bit 0 up, as they
* are never OR'ed into the mode in mempolicy API arguments.
*/
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
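From user space these policies are chosen with the set_mempolicy()/mbind() system calls, which libnuma wraps. A minimal sketch that switches the calling thread to interleaved allocation, assuming a NUMA machine where nodes 0 and 1 exist and libnuma is installed (compile with -lnuma; error handling kept short):
#include <numa.h>       /* libnuma: numa_available()                        */
#include <numaif.h>     /* MPOL_* constants and the set_mempolicy() wrapper */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    if (numa_available() < 0) {
        fprintf(stderr, "NUMA is not available on this system\n");
        return 1;
    }

    /* Interleave this thread's future allocations across nodes 0 and 1.
     * The nodemask is a bitmap of node ids; maxnode counts bits. */
    unsigned long nodemask = (1UL << 0) | (1UL << 1);
    if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8 + 1) != 0) {
        perror("set_mempolicy");
        return 1;
    }

    /* Pages of this buffer are distributed round-robin over nodes 0/1
     * as they are first touched. */
    size_t sz = 64UL << 20;
    char *buf = malloc(sz);
    if (!buf)
        return 1;
    memset(buf, 0, sz);                       /* fault the pages in */
    printf("touched %zu MiB under MPOL_INTERLEAVE (nodes 0-1)\n", sz >> 20);

    free(buf);
    set_mempolicy(MPOL_DEFAULT, NULL, 0);     /* back to local allocation */
    return 0;
}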
4.2、Zone modifiers (decide which zone memory is allocated from)
// Zone selection mask
// By default memory is allocated from ZONE_NORMAL
// See: https://www.cnblogs.com/arnoldlu/p/8250734.html
// https://blog.csdn.net/farmwang/article/details/66975128
#define __GFP_DMA ((__force gfp_t)0x01u)
#define __GFP_HIGHMEM ((__force gfp_t)0x02u)
#define __GFP_DMA32 ((__force gfp_t)0x04u)
#define __GFP_MOVABLE ((__force gfp_t)0x08u)
#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
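How these zone bits turn into a zone index can be sketched after the pattern of older gfp_zone() implementations (later kernels use a GFP_ZONE_TABLE lookup; ZONE_MOVABLE handling is left out here, and all names below are simplified stand-ins):
#include <stdio.h>

/* simplified copies of the flag values and zone indices above */
#define M_GFP_DMA      0x01u
#define M_GFP_HIGHMEM  0x02u
#define M_GFP_DMA32    0x04u
#define M_GFP_MOVABLE  0x08u

enum model_zone { M_ZONE_DMA, M_ZONE_DMA32, M_ZONE_NORMAL, M_ZONE_HIGHMEM };

/* Precedence modelled on pre-GFP_ZONE_TABLE gfp_zone(): an explicit DMA/DMA32
 * request wins, __GFP_HIGHMEM lifts the ceiling to HighMem, and everything
 * else falls back to ZONE_NORMAL (__GFP_MOVABLE / ZONE_MOVABLE omitted). */
static enum model_zone model_gfp_zone(unsigned int flags)
{
    if (flags & M_GFP_DMA)
        return M_ZONE_DMA;
    if (flags & M_GFP_DMA32)
        return M_ZONE_DMA32;
    if (flags & M_GFP_HIGHMEM)
        return M_ZONE_HIGHMEM;
    return M_ZONE_NORMAL;
}

int main(void)
{
    static const char *names[] = { "DMA", "DMA32", "Normal", "HighMem" };

    printf("no zone bits  -> %s\n", names[model_gfp_zone(0)]);
    printf("__GFP_HIGHMEM -> %s\n", names[model_gfp_zone(M_GFP_HIGHMEM)]);
    printf("__GFP_DMA     -> %s\n", names[model_gfp_zone(M_GFP_DMA)]);
    return 0;
}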
4.3、Allocation behaviour flags (control how the allocation proceeds)
// Action modifier mask
// See: https://www.cnblogs.com/arnoldlu/p/8250734.html
#define __GFP_WAIT ((__force gfp_t)0x10u) /* Can wait and reschedule? */
#define __GFP_HIGH ((__force gfp_t)0x20u) /* Should access emergency pools? */
#define __GFP_IO ((__force gfp_t)0x40u) /* Can start physical IO? */
#define __GFP_FS ((__force gfp_t)0x80u) /* Can call down to low-level FS? */
#define __GFP_COLD ((__force gfp_t)0x100u) /* Cache-cold page required */
#define __GFP_NOWARN ((__force gfp_t)0x200u) /* Suppress page allocation failure warning */
#define __GFP_REPEAT ((__force gfp_t)0x400u) /* Try hard to allocate, the attempt may still fail */
#define __GFP_NOFAIL ((__force gfp_t)0x800u) /* Retry indefinitely, caller cannot handle failure */
#define __GFP_NORETRY ((__force gfp_t)0x1000u)/* Do not retry, fail immediately */
#define __GFP_COMP ((__force gfp_t)0x4000u)/* Add compound page metadata */
#define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */
#define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
#define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */
#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
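For orientation, the familiar composite masks of kernels from roughly this era are just combinations of the bits listed above (exact contents vary a little between versions):
#define GFP_ATOMIC   (__GFP_HIGH)                       /* may not sleep, may tap reserves  */
#define GFP_NOIO     (__GFP_WAIT)                       /* may sleep, but must not start IO */
#define GFP_NOFS     (__GFP_WAIT | __GFP_IO)            /* may do IO, but no FS callbacks   */
#define GFP_KERNEL   (__GFP_WAIT | __GFP_IO | __GFP_FS) /* normal kernel allocation         */
#define GFP_USER     (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_HIGHMEM)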
4.4、Watermark-related allocation flags
// flags selecting the watermark used during allocation
#define ALLOC_WMARK_MIN WMARK_MIN
#define ALLOC_WMARK_LOW WMARK_LOW
#define ALLOC_WMARK_HIGH WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04
#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
// See: https://www.codeleading.com/article/59964929993
// ALLOC_HARDER lowers the watermark by 1/4 of its value;
// ALLOC_HIGH lowers the reserved watermark by 1/2.
// The two flags shrink the amount of reserved memory to different degrees, so the allocation is more likely to succeed;
// this also shows that ALLOC_HIGH is a more aggressive way of requesting memory than ALLOC_HARDER.
#define ALLOC_HARDER 0x10
#define ALLOC_HIGH 0x20
#define ALLOC_CPUSET 0x40
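A stand-alone sketch of how ALLOC_HIGH and ALLOC_HARDER relax the check in zone_watermark_ok() (heavily simplified: the real function also subtracts lowmem_reserve and walks the per-order free lists):
#include <stdio.h>

#define M_ALLOC_HARDER 0x10  /* mirror of ALLOC_HARDER above */
#define M_ALLOC_HIGH   0x20  /* mirror of ALLOC_HIGH above   */

/* Simplified order-0 watermark check: ALLOC_HIGH drops the required reserve
 * by 1/2, ALLOC_HARDER by a further 1/4, so the same number of free pages
 * can pass a check it would otherwise fail. */
static int model_watermark_ok(long free_pages, long mark, int alloc_flags)
{
    long min = mark;

    if (alloc_flags & M_ALLOC_HIGH)
        min -= min / 2;
    if (alloc_flags & M_ALLOC_HARDER)
        min -= min / 4;

    return free_pages > min;
}

int main(void)
{
    long free = 600, mark = 1024;

    printf("no flags:     %d\n", model_watermark_ok(free, mark, 0));
    printf("ALLOC_HARDER: %d\n", model_watermark_ok(free, mark, M_ALLOC_HARDER));
    printf("ALLOC_HIGH:   %d\n", model_watermark_ok(free, mark, M_ALLOC_HIGH));
    printf("HIGH|HARDER:  %d\n",
           model_watermark_ok(free, mark, M_ALLOC_HIGH | M_ALLOC_HARDER));
    return 0;
}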