1. Core data structures
1.1 slab
struct slab {
struct list_head list; // list node linking this slab into one of the kmem_list3 lists (free/partial/full)
unsigned long colouroff; // colour offset (slab colouring)
void *s_mem; // address of the first object, colour offset included
unsigned int inuse; // number of objects currently in use in this slab
kmem_bufctl_t free; // effectively the head of a free-object list: index of the next free object
unsigned short nodeid; // NUMA node id
};
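The free field and the kmem_bufctl_t array that sits right behind the slab descriptor together form an index-linked free list. A minimal sketch of how one object is taken off that chain, modelled on the mainline slab_bufctl()/slab_get_obj() helpers of this era (simplified, debug checks dropped; nodeid is only used by those checks in the real function):
// the bufctl array is laid out immediately after the slab descriptor
static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
{
	return (kmem_bufctl_t *)(slabp + 1);
}
static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid)
{
	// object number slabp->free starts at s_mem + free * buffer_size
	void *objp = slabp->s_mem + cachep->buffer_size * slabp->free;
	slabp->inuse++;
	// advance the free head to the index stored in this object's bufctl slot;
	// BUFCTL_END here means the slab has just become full
	slabp->free = slab_bufctl(slabp)[slabp->free];
	return objp;
}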
/*
* struct slab_rcu
*
* slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
* arrange for kmem_freepages to be called via RCU. This is useful if
* we need to approach a kernel structure obliquely, from its address
* obtained without the usual locking. We can lock the structure to
* stabilize it and check it's still at the given address, only if we
* can be sure that the memory has not been meanwhile reused for some
* other kind of object (which our subsystem's lock might corrupt).
*
* rcu_read_lock before reading the address, then rcu_read_unlock after
* taking the spinlock within the structure expected at that address.
*
* We assume struct slab_rcu can overlay struct slab when destroying.
*/
struct slab_rcu {
struct rcu_head head;
struct kmem_cache *cachep;
void *addr;
};
1.2 kmem_cache
struct kmem_cache {
struct array_cache *array[NR_CPUS]; // per-CPU object cache, one array_cache per CPU
unsigned int batchcount; // number of objects transferred at once between the per-CPU cache and the slab lists
unsigned int limit; // upper limit on the number of objects kept in each per-CPU cache
unsigned int shared; // size of the per-node shared array (0 = no sharing); the shared array lives in kmem_list3
unsigned int buffer_size; // size of one cached object, including alignment padding
u32 reciprocal_buffer_size; // reciprocal of buffer_size, for division-free index math
unsigned int flags; // constant cache flags (e.g. CFLGS_OFF_SLAB, SLAB_* flags)
unsigned int num; // number of objects per slab
unsigned int gfporder; // page order used when requesting pages from the buddy allocator
gfp_t gfpflags; // GFP flags used when requesting pages from the buddy allocator
size_t colour; // number of distinct colour offsets (left_over / colour_off)
unsigned int colour_off; // colour offset granularity, normally the cache line size
struct kmem_cache *slabp_cache; // if the slab descriptor is off-slab, it is allocated from this kmem_cache
unsigned int slab_size; // size of the slab management area (slab descriptor + bufctl array + alignment padding)
unsigned int dflags; // dynamic flags, may change during the cache's lifetime
void (*ctor)(void *obj); // object constructor
const char *name; // name of this kmem_cache
struct list_head next; // links this kmem_cache into the global cache_chain list
struct kmem_list3 *nodelists[MAX_NUMNODES]; // kept last so the structure can be sized by the actual number of nodes
};
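reciprocal_buffer_size lets the allocator turn "which object index does this pointer correspond to" into a multiplication instead of a division. A sketch of how mm/slab.c of this era uses it:
// index of obj within its slab: (obj - s_mem) / buffer_size,
// computed with the precomputed reciprocal instead of a real division
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
		const struct slab *slab, void *obj)
{
	u32 offset = (obj - slab->s_mem);
	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}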
1.3 kmem_list3
struct kmem_list3 {
spinlock_t list_lock; // protects the slab lists below
struct list_head slabs_partial; // partially used slabs (still contain free objects)
struct list_head slabs_full; // slabs with no free objects left
struct list_head slabs_free; // completely free (unused) slabs
unsigned long free_objects; // number of free objects remaining on this node
unsigned int free_limit; // upper limit on how many free objects are kept around
unsigned int colour_next; // colour offset to use for the next slab allocated on this node
struct array_cache *shared; // array_cache shared by all CPUs on this node
struct array_cache **alien; // array_caches holding objects that belong to other nodes
unsigned long next_reap; // next time the cache reaper should process this node
int free_touched; // set when the free lists were recently used; consulted by the cache reaper
};
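For reference, kmem_list3_init(), which kmem_cache_init() below calls on every initkmem_list3 entry, simply puts a node structure into a known-empty state; this is essentially the mainline helper:
static void kmem_list3_init(struct kmem_list3 *parent)
{
	INIT_LIST_HEAD(&parent->slabs_full);
	INIT_LIST_HEAD(&parent->slabs_partial);
	INIT_LIST_HEAD(&parent->slabs_free);
	parent->shared = NULL;
	parent->alien = NULL;
	parent->colour_next = 0;
	spin_lock_init(&parent->list_lock);
	parent->free_objects = 0;
	parent->free_touched = 0;
}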
2. Core global static variables
// array_cache and its statically initialized boot-time instances
#define BOOT_CPUCACHE_ENTRIES 1
// this one is used by cache_cache
static struct arraycache_init initarray_cache __initdata =
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
// this one is used by the generic kmem_caches created during init; in practice it serves the ac cache
// by the time the l3 lists are set up, the ac cache already exists, so kmalloc can be used directly
static struct arraycache_init initarray_generic =
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
struct arraycache_init {
struct array_cache cache;
void *entries[BOOT_CPUCACHE_ENTRIES];
};
struct array_cache {
unsigned int avail;
unsigned int limit;
unsigned int batchcount;
unsigned int touched;
spinlock_t lock;
// entry[] is a flexible array member; the arraycache_init wrapper above reserves space for its entries (BOOT_CPUCACHE_ENTRIES pointers)
void *entry[];
};
// fake kmem_list3s used while bootstrapping the kmem_caches
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
// statically initialized kmem_cache
// cache_cache is the cache that kmem_cache structures themselves are allocated from
static struct kmem_cache cache_cache = {
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES,
.shared = 1,
.buffer_size = sizeof(struct kmem_cache),
.name = "kmem_cache",
};
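cpu_cache_get(), used throughout the code below, simply returns the calling CPU's array_cache of a cache; during bootstrap that is the static initarray_cache set up above:
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
	return cachep->array[smp_processor_id()];
}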
3. Slab system initialization
void __init kmem_cache_init(void)
{
size_t left_over;
struct cache_sizes *sizes;
struct cache_names *names;
int i, order, node;
// if there are no other NUMA nodes, alien caches are not used
if (num_possible_nodes() == 1)
use_alien_caches = 0;
// initialize initkmem_list3
for (i = 0; i < NUM_INIT_LISTS; i++) {
// put the global initkmem_list3 entries into a known-empty state
kmem_list3_init(&initkmem_list3[i]);
if (i < MAX_NUMNODES)
// clear the nodelists of the static cache_cache
cache_cache.nodelists[i] = NULL;
}
// initialize cache_cache
// assign nodelists: the first node_num entries of initkmem_list3 are handed out in turn
// CACHE_CACHE is 0
// initkmem_list3 has 3 * MAX_NUMNODES entries; the other 2 * MAX_NUMNODES are
// used later by the kmem_caches for ac and l3
set_up_list3s(&cache_cache, CACHE_CACHE);
/* The global slab_break_gfp_order limits how many pages a single slab may
 * occupy, to keep fragmentation down. For example, with a 3360-byte object a
 * one-page slab wastes 736 bytes, and a two-page slab wastes twice as much.
 * The limit may only be exceeded when the object is so large that not even
 * one fits in a slab. Two values are possible: with more than 32MB of usable
 * memory BREAK_GFP_ORDER_HI (1) is used, i.e. at most 2 pages per slab, and
 * only objects larger than 8192 bytes may exceed slab_break_gfp_order; with
 * 32MB or less, BREAK_GFP_ORDER_LO (0) is used. */
if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
slab_break_gfp_order = BREAK_GFP_ORDER_HI;
// get the current node id
node = numa_node_id();
// set up cache_cache:
// 1. add it to the cache_chain list
// 2. compute the colour offset
// 3. fill in the per-CPU array
// 4. set up its kmem_list3
INIT_LIST_HEAD(&cache_chain);
list_add(&cache_cache.next, &cache_chain);
// cache_line_size() is 64 bytes here
cache_cache.colour_off = cache_line_size();
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
// this looks redundant: set_up_list3s() has already set nodelists[node]
cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
// compute buffer_size and its reciprocal
// buffer_size is aligned to the cache line
// nodelists being the last member makes it easy to size kmem_cache by the actual number of nodes
cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
nr_node_ids * sizeof(struct kmem_list3 *);
cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
cache_line_size());
cache_cache.reciprocal_buffer_size = reciprocal_value(cache_cache.buffer_size);
// walk the buddy-system page orders and find the first order at which buffer_size fits,
// and compute how many objects fit at that order
for (order = 0; order < MAX_ORDER; order++) {
cache_estimate(order, cache_cache.buffer_size,
cache_line_size(), 0, &left_over, &cache_cache.num);
if (cache_cache.num)
break;
}
cache_cache.gfporder = order;
// left_over = slab_size - nr_objs * buffer_size - mgmt_size
cache_cache.colour = left_over / cache_cache.colour_off;
// size of the slab descriptor plus the kmem_bufctl_t array
cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
sizeof(struct slab), cache_line_size());
// see the header file: create the kmalloc caches for each size class
sizes = malloc_sizes;
names = cache_names;
// first create the general-purpose caches that array_cache and kmem_list3 will use
// if both map to the same size class, a single cache is shared
sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
sizes[INDEX_AC].cs_size,
ARCH_KMALLOC_MINALIGN,
ARCH_KMALLOC_FLAGS|SLAB_PANIC,
NULL);
if (INDEX_AC != INDEX_L3) {
sizes[INDEX_L3].cs_cachep =
kmem_cache_create(names[INDEX_L3].name,
sizes[INDEX_L3].cs_size,
ARCH_KMALLOC_MINALIGN,
ARCH_KMALLOC_FLAGS|SLAB_PANIC,
NULL);
}
slab_early_init = 0;
while (sizes->cs_size != ULONG_MAX) {
// create the regular cache and the DMA cache for each remaining size class
if (!sizes->cs_cachep) {
sizes->cs_cachep = kmem_cache_create(names->name,
sizes->cs_size,
ARCH_KMALLOC_MINALIGN,
ARCH_KMALLOC_FLAGS|SLAB_PANIC,
NULL);
}
sizes->cs_dmacachep = kmem_cache_create(
names->name_dma,
sizes->cs_size,
ARCH_KMALLOC_MINALIGN,
ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|SLAB_PANIC,
NULL);
sizes++;
names++;
}
{
struct array_cache *ptr;
// replace the static arraycache_init of cache_cache and of the ac cache with kmalloc'ed copies
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
memcpy(ptr, cpu_cache_get(&cache_cache),
sizeof(struct arraycache_init));
spin_lock_init(&ptr->lock);
cache_cache.array[smp_processor_id()] = ptr;
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
sizeof(struct arraycache_init));
spin_lock_init(&ptr->lock);
malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
ptr;
}
{
int nid;
// replace the bootstrap kmem_list3 of every online node
for_each_online_node(nid) {
init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
init_list(malloc_sizes[INDEX_AC].cs_cachep,
&initkmem_list3[SIZE_AC + nid], nid);
if (INDEX_AC != INDEX_L3) {
init_list(malloc_sizes[INDEX_L3].cs_cachep,
&initkmem_list3[SIZE_L3 + nid], nid);
}
}
}
g_cpucache_up = EARLY;
}
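cache_estimate(), called in the order loop above, is what produces num and left_over. A simplified sketch based on the mainline function of this era (the SLAB_LIMIT clamp is dropped; slab_mgmt_size(n, align) is ALIGN(sizeof(struct slab) + n * sizeof(kmem_bufctl_t), align)):
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
		size_t align, int flags, size_t *left_over, unsigned int *num)
{
	int nr_objs;
	size_t mgmt_size;
	size_t slab_size = PAGE_SIZE << gfporder;

	if (flags & CFLGS_OFF_SLAB) {
		// descriptor lives elsewhere, the whole slab holds objects
		mgmt_size = 0;
		nr_objs = slab_size / buffer_size;
	} else {
		// first a rough estimate ignoring alignment of the management area,
		// then shrink by one if descriptor + bufctls + objects overflow the slab
		nr_objs = (slab_size - sizeof(struct slab)) /
			(buffer_size + sizeof(kmem_bufctl_t));
		if (slab_mgmt_size(nr_objs, align) + nr_objs * buffer_size > slab_size)
			nr_objs--;
		mgmt_size = slab_mgmt_size(nr_objs, align);
	}
	*num = nr_objs;
	*left_over = slab_size - nr_objs * buffer_size - mgmt_size;
}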
4. Creating a kmem_cache
struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *))
{
size_t left_over, slab_size, ralign;
struct kmem_cache *cachep = NULL, *pc;
gfp_t gfp;
// reject: NULL name / in interrupt context / object smaller than one word (4 bytes) / larger than KMALLOC_MAX_SIZE
if (!name || in_interrupt() || (size < BYTES_PER_WORD) || size > KMALLOC_MAX_SIZE) {
BUG();
}
if (slab_is_available()) {
get_online_cpus();
// lock
mutex_lock(&cache_chain_mutex);
}
// a printk was removed here; it does not affect the main logic
// round the object size up to a word boundary if it is not already word-aligned
if (size & (BYTES_PER_WORD - 1)) {
size += (BYTES_PER_WORD - 1);
size &= ~(BYTES_PER_WORD - 1);
}
// check whether the slab objects must be aligned to the hardware cache line
// SLAB_HWCACHE_ALIGN: align objects to the hardware cache line
// otherwise align to one word (4 bytes)
if (flags & SLAB_HWCACHE_ALIGN) {
// halve the alignment while at least two objects still fit in it (until size > ralign / 2)
ralign = cache_line_size();
while (size <= ralign / 2)
ralign /= 2;
} else {
ralign = BYTES_PER_WORD;
}
// see the comment in the kernel source
// SLAB_STORE_USER and SLAB_RED_ZONE are debug options
if (flags & SLAB_STORE_USER)
ralign = BYTES_PER_WORD;
if (flags & SLAB_RED_ZONE) {
ralign = REDZONE_ALIGN;
size += REDZONE_ALIGN - 1;
size &= ~(REDZONE_ALIGN - 1);
}
// ARCH_SLAB_MINALIGN is 0 here, so this branch does nothing
// on architectures that define it, it enforces a minimum alignment for slab objects
if (ralign < ARCH_SLAB_MINALIGN) {
ralign = ARCH_SLAB_MINALIGN;
}
// use the larger of the two alignments
// the caller-requested align can only raise the computed one
if (ralign < align) {
ralign = align;
}
// if ralign exceeds the alignment of unsigned long long,
// turn the debug options off
if (ralign > __alignof__(unsigned long long))
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
// overwrite align with the final value
align = ralign;
if (slab_is_available())
gfp = GFP_KERNEL;
else
gfp = GFP_NOWAIT;
// allocate a kmem_cache from cache_cache
cachep = kmem_cache_zalloc(&cache_cache, gfp);
if (!cachep)
goto oops;
// if the object is fairly large (>= PAGE_SIZE/8, i.e. 512 bytes here) and we are past early init,
// keep the slab descriptor off-slab (outside the slab's own pages)
if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
!(flags & SLAB_NOLEAKTRACE))
flags |= CFLGS_OFF_SLAB;
// align the object size
size = ALIGN(size, align);
// left_over is the space remaining after objects and management data are packed with the given alignment
left_over = calculate_slab_order(cachep, size, align, flags);
// this can only happen if the object is too large for any allowed slab order
if (!cachep->num) {
printk(KERN_ERR
"kmem_cache_create: couldn't create cache %s.\n", name);
kmem_cache_free(&cache_cache, cachep);
cachep = NULL;
goto oops;
}
slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
+ sizeof(struct slab), align);
// if the leftover space is large enough to hold the slab management structure,
// move the descriptor back on-slab to save a separate allocation
if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
flags &= ~CFLGS_OFF_SLAB;
left_over -= slab_size;
}
// if the descriptor stays off-slab, recompute slab_size
// no alignment is needed in that case
if (flags & CFLGS_OFF_SLAB) {
slab_size = cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
/* If we're going to use the generic kernel_map_pages()
* poisoning, then it's going to smash the contents of
* the redzone and userword anyhow, so switch them off.
*/
if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
}
cachep->colour_off = cache_line_size();
if (cachep->colour_off < align)
cachep->colour_off = align;
// how many distinct colour offsets fit into the leftover space
cachep->colour = left_over / cachep->colour_off;
cachep->slab_size = slab_size;
cachep->flags = flags;
cachep->gfpflags = 0;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
cachep->gfpflags |= GFP_DMA;
cachep->buffer_size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
if (flags & CFLGS_OFF_SLAB) {
cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
/*
* This is a possibility for one of the malloc_sizes caches.
* But since we go off slab only for object size greater than
* PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
* this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
}
cachep->ctor = ctor;
cachep->name = name;
if (setup_cpu_cache(cachep, gfp)) {
__kmem_cache_destroy(cachep);
cachep = NULL;
goto oops;
}
list_add(&cachep->next, &cache_chain);
oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
if (slab_is_available()) {
mutex_unlock(&cache_chain_mutex);
put_online_cpus();
}
return cachep;
}
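A hypothetical usage example of this API (struct foo, foo_cachep and the function names are made up for illustration), showing the usual create / alloc / free lifecycle:
// hypothetical driver-private object
struct foo {
	int id;
	char payload[120];
};

static struct kmem_cache *foo_cachep;

static int __init foo_init(void)
{
	// objects come out cache-line aligned; no constructor needed
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
			0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cachep)
		return -ENOMEM;
	return 0;
}

static void foo_use(void)
{
	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
	if (!f)
		return;
	// ... use f ...
	kmem_cache_free(foo_cachep, f);
}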
5. Allocation APIs
5.1 Wrapper APIs
// allocate one object from the given cache (e.g. from cache_cache to get a new kmem_cache)
// the header defines kmem_cache_zalloc() as a wrapper around kmem_cache_alloc()
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
return __cache_alloc(cachep, flags, __builtin_return_address(0));
}
// allocate an object from the given NUMA node
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
return __cache_alloc_node(cachep, flags, nodeid, __builtin_return_address(0));
}
// the kmalloc() entry point exposed to the rest of the kernel
void *__kmalloc(size_t size, gfp_t flags)
{
return __do_kmalloc(size, flags, NULL);
}
// core allocation path for kmalloc
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, void *caller)
{
struct kmem_cache *cachep;
void *ret;
// find a suitable cache for this size
cachep = __find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
// allocate one object from it
ret = __cache_alloc(cachep, flags, caller);
return ret;
}
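__find_general_cachep(), used by __do_kmalloc() above, walks the malloc_sizes table until it reaches the first size class that fits, then picks the DMA variant if GFP_DMA was requested; roughly:
static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags)
{
	struct cache_sizes *csizep = malloc_sizes;

	if (!size)
		return ZERO_SIZE_PTR;	// kmalloc(0) gets a special non-NULL marker

	// the table is sorted ascending, so the first class that fits is the tightest
	while (size > csizep->cs_size)
		csizep++;

	if (unlikely(gfpflags & GFP_DMA))
		return csizep->cs_dmacachep;
	return csizep->cs_cachep;
}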
5.2 Core APIs
// take an object from the given cachep
// many debug checks have been removed here; they do not affect the main logic
static __always_inline void *__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
{
void *objp;
flags &= gfp_allowed_mask;
// fault injection for allocation failures; ignore
if (slab_should_failslab(cachep, flags))
return NULL;
objp = __do_cache_alloc(cachep, flags);
prefetchw(objp);
if (unlikely((flags & __GFP_ZERO) && objp))
memset(objp, 0, obj_size(cachep));
return objp;
}
static __always_inline void *__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
{
void *objp;
if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
objp = alternate_node_alloc(cache, flags);
if (objp)
goto out;
}
// try the local node's per-CPU cache first
objp = ____cache_alloc(cache, flags);
// if that fails, allocate through the node's slab lists (which can fall back to other nodes)
if (!objp)
objp = ____cache_alloc_node(cache, flags, numa_node_id());
out:
return objp;
}
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
check_irq_off();
// take from the per-CPU array_cache first
// if the ac is empty, refill it from the slab lists
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
ac->touched = 1;
objp = ac->entry[--ac->avail];
} else {
objp = cache_alloc_refill(cachep, flags);
}
return objp;
}
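When the per-CPU array is empty, cache_alloc_refill() pulls up to batchcount objects into it: first from the node's shared array, then from partial/free slabs, growing the cache with fresh pages only if nothing is available, and finally pops one object. A heavily simplified sketch (locking, NUMA and debug details dropped; pick_partial_or_free_slab() and fixup_slab_list() are hypothetical helpers standing in for the inline list handling of the real function):
static void *cache_alloc_refill_sketch(struct kmem_cache *cachep, gfp_t flags)
{
	struct array_cache *ac = cpu_cache_get(cachep);
	struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()];
	int batchcount = ac->batchcount;

	spin_lock(&l3->list_lock);
	// 1. try to grab a whole batch from the per-node shared array
	if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
		goto done;

	// 2. otherwise carve objects out of partial slabs (preferred) or free slabs
	while (batchcount > 0) {
		struct slab *slabp = pick_partial_or_free_slab(l3);	// hypothetical helper
		if (!slabp)
			break;
		while (slabp->inuse < cachep->num && batchcount--)
			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, numa_node_id());
		fixup_slab_list(l3, slabp);	// hypothetical helper: requeue on full/partial
	}
	l3->free_objects -= ac->avail;	// objects moved out of the slab lists are no longer "free"
done:
	spin_unlock(&l3->list_lock);

	// 3. still empty: grow the cache and retry, otherwise give up
	if (unlikely(!ac->avail)) {
		if (cache_grow(cachep, flags | GFP_THISNODE, numa_node_id(), NULL))
			return cache_alloc_refill_sketch(cachep, flags);
		return NULL;
	}
	ac->touched = 1;
	return ac->entry[--ac->avail];	// hand out the last object pulled in
}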
static __always_inline void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, void *caller)
{
void *ptr;
flags &= gfp_allowed_mask;
if (nodeid == -1)
nodeid = numa_node_id();
if (unlikely(!cachep->nodelists[nodeid])) {
ptr = fallback_alloc(cachep, flags);
goto out;
}
if (nodeid == numa_node_id()) {
/*
* Use the locally cached objects if possible.
* However ____cache_alloc does not allow fallback
* to other nodes. It may fail while we still have
* objects on other nodes available.
*/
ptr = ____cache_alloc(cachep, flags);
if (ptr)
goto out;
}
ptr = ____cache_alloc_node(cachep, flags, nodeid);
out:
if (unlikely((flags & __GFP_ZERO) && ptr))
memset(ptr, 0, obj_size(cachep));
return ptr;
}
static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
struct list_head *entry;
struct slab *slabp;
struct kmem_list3 *l3;
void *obj;
int x;
l3 = cachep->nodelists[nodeid];
retry:
spin_lock(&l3->list_lock);
entry = l3->slabs_partial.next;
if (entry == &l3->slabs_partial) {
l3->free_touched = 1;
entry = l3->slabs_free.next;
if (entry == &l3->slabs_free)
goto must_grow;
}
slabp = list_entry(entry, struct slab, list);
check_spinlock_acquired_node(cachep, nodeid);
check_slabp(cachep, slabp);
obj = slab_get_obj(cachep, slabp, nodeid);
check_slabp(cachep, slabp);
l3->free_objects--;
list_del(&slabp->list);
if (slabp->free == BUFCTL_END)
list_add(&slabp->list, &l3->slabs_full);
else
list_add(&slabp->list, &l3->slabs_partial);
spin_unlock(&l3->list_lock);
goto done;
must_grow:
spin_unlock(&l3->list_lock);
x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
if (x)
goto retry;
return fallback_alloc(cachep, flags);
done:
return obj;
}
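cache_grow() (the must_grow path above, also used by fallback_alloc() below) adds one fresh slab to a node: pick the next colour, get pages from the buddy allocator, set up the slab descriptor (on- or off-slab), initialize the objects and their bufctl chain, and hang the slab on slabs_free. A simplified sketch under those assumptions (error paths, irq handling and debug checks dropped; the _sketch name is made up):
static int cache_grow_sketch(struct kmem_cache *cachep, gfp_t flags, int nodeid, void *objp)
{
	struct kmem_list3 *l3 = cachep->nodelists[nodeid];
	struct slab *slabp;
	size_t offset;

	// pick the colour offset for this slab, cycling through cachep->colour slots
	spin_lock(&l3->list_lock);
	offset = l3->colour_next++;
	if (l3->colour_next >= cachep->colour)
		l3->colour_next = 0;
	spin_unlock(&l3->list_lock);
	offset *= cachep->colour_off;

	// get 2^gfporder pages, unless the caller already has them (fallback_alloc)
	if (!objp)
		objp = kmem_getpages(cachep, flags, nodeid);
	// on-slab: descriptor carved out of the pages; off-slab: allocated from slabp_cache
	slabp = alloc_slabmgmt(cachep, objp, offset, flags, nodeid);
	slab_map_pages(cachep, slabp, objp);	// let each page point back to cachep/slabp
	cache_init_objs(cachep, slabp);		// run the ctor and chain up the bufctl free list

	spin_lock(&l3->list_lock);
	list_add_tail(&slabp->list, &l3->slabs_free);
	l3->free_objects += cachep->num;
	spin_unlock(&l3->list_lock);
	return 1;				// success: caller retries the allocation
}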
/*
* Fallback function if there was no memory available and no objects on a
* certain node and fall back is permitted. First we scan all the
* available nodelists for available objects. If that fails then we
* perform an allocation without specifying a node. This allows the page
* allocator to do its reclaim / fallback magic. We then insert the
* slab into the proper nodelist and then allocate from it.
*/
static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
{
struct zonelist *zonelist;
gfp_t local_flags;
struct zoneref *z;
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
int nid;
if (flags & __GFP_THISNODE)
return NULL;
zonelist = node_zonelist(slab_node(current->mempolicy), flags);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
retry:
/*
* Look through allowed nodes for objects available
* from existing per node queues.
*/
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
nid = zone_to_nid(zone);
if (cpuset_zone_allowed_hardwall(zone, flags) &&
cache->nodelists[nid] &&
cache->nodelists[nid]->free_objects) {
obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid);
if (obj)
break;
}
}
if (!obj) {
/*
* This allocation will be performed within the constraints
* of the current cpuset / memory policy requirements.
* We may trigger various forms of reclaim on the allowed
* set and go into memory reserves if necessary.
*/
obj = kmem_getpages(cache, local_flags, numa_node_id());
if (obj) {
nid = page_to_nid(virt_to_page(obj));
if (cache_grow(cache, flags, nid, obj)) {
obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid);
if (!obj)
/*
* Another processor may allocate the
* objects in the slab since we are
* not holding any locks.
*/
goto retry;
} else {
/* cache_grow already freed obj */
obj = NULL;
}
}
}
return obj;
}