1. Core data structures
1.1 slab
struct slab {
    struct list_head list;      // links this slab into the free/partial/full lists of a kmem_list3
    unsigned long colouroff;    // colour offset (cache colouring)
    void *s_mem;                // address of the first object, colour offset already applied
    unsigned int inuse;         // number of objects currently in use in this slab
    kmem_bufctl_t free;         // effectively a free-object list: index of the next available object
    unsigned short nodeid;      // NUMA node id
};

/*
 * struct slab_rcu
 *
 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 * arrange for kmem_freepages to be called via RCU. This is useful if
 * we need to approach a kernel structure obliquely, from its address
 * obtained without the usual locking. We can lock the structure to
 * stabilize it and check it's still at the given address, only if we
 * can be sure that the memory has not been meanwhile reused for some
 * other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * We assume struct slab_rcu can overlay struct slab when destroying.
 */
struct slab_rcu {
    struct rcu_head head;
    struct kmem_cache *cachep;
    void *addr;
};
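The free field works together with the kmem_bufctl_t array that sits right behind the slab descriptor: entry i of that array holds the index of the next free object after object i, and BUFCTL_END terminates the chain. The helpers below are roughly how this era of mm/slab.c walks that chain (reconstructed from memory, debug code dropped, so treat it as a sketch); slab_get_obj is the function used later in ____cache_alloc_node.

// the kmem_bufctl_t array lives immediately after the slab descriptor
static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
{
    return (kmem_bufctl_t *)(slabp + 1);
}

// object i starts buffer_size * i bytes after s_mem
static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
                                 unsigned int idx)
{
    return slab->s_mem + cache->buffer_size * idx;
}

// pop the head of the free chain: take object slabp->free and advance
// free to the index stored in its bufctl slot
static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
                          int nodeid)
{
    void *objp = index_to_obj(cachep, slabp, slabp->free);
    kmem_bufctl_t next;

    slabp->inuse++;
    next = slab_bufctl(slabp)[slabp->free];
    slabp->free = next;
    return objp;
}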
1.2 kmem_cache
struct kmem_cache {
    struct array_cache *array[NR_CPUS];   // per-CPU object cache
    unsigned int batchcount;              // how many objs to move at a time between the per-CPU cache and the slab lists
    unsigned int limit;                   // upper bound on objs held in each per-CPU cache
    unsigned int shared;                  // whether (and how much) to share between CPUs; the shared array lives in kmem_list3
    unsigned int buffer_size;             // size of each cached obj, aligned to the cache line
    u32 reciprocal_buffer_size;           // reciprocal of buffer_size
    unsigned int flags;                   // constant cache flags (e.g. CFLGS_OFF_SLAB)
    unsigned int num;                     // number of objs per slab
    unsigned int gfporder;                // page order used when asking the buddy system for pages
    gfp_t gfpflags;                       // GFP flags used when asking the buddy system for pages
    size_t colour;                        // number of distinct colour offsets (left_over / colour_off)
    unsigned int colour_off;              // colour offset granularity (cache line size)
    struct kmem_cache *slabp_cache;       // if the slab descriptor is off-slab, allocate it from this kmem_cache
    unsigned int slab_size;               // size of the slab management area (slab descriptor + kmem_bufctl_t free list, aligned)
    unsigned int dflags;                  // dynamic flags
    void (*ctor)(void *obj);              // obj constructor
    const char *name;                     // kmem_cache name
    struct list_head next;                // links this kmem_cache into cache_chain
    struct kmem_list3 *nodelists[MAX_NUMNODES]; // kept last, so the struct can be sized for the real number of nodes
};
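Almost every fast path starts by picking the current CPU's array_cache out of array[]. As far as I can tell, the helper that does this in this version is just the following (shown for reference; verify against your tree):

static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
    // each CPU indexes its own slot, so no locking is needed here
    return cachep->array[smp_processor_id()];
}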
1.3 kmem_list3
struct kmem_list3 {
    spinlock_t list_lock;             // protects the lists below
    struct list_head slabs_partial;   // slabs that still have free objects
    struct list_head slabs_full;      // slabs with no free objects left
    struct list_head slabs_free;      // slabs whose objects are all free
    unsigned long free_objects;       // how many free objects remain on this node
    unsigned int free_limit;          // upper bound on free objects kept on this node
    unsigned int colour_next;         // colour offset to use for the next slab
    struct array_cache *shared;       // cache shared by CPUs on this node
    struct array_cache **alien;       // caches for objects belonging to other nodes
    unsigned long next_reap;          // time of the next reap (cache shrink) pass
    int free_touched;                 // set when the free lists were recently used; checked by the reaper
};
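For reference, this is roughly what kmem_list3_init(), called at the top of kmem_cache_init() below, does to one of these structures. Reconstructed from memory of this kernel era, so treat it as a sketch:

static void kmem_list3_init(struct kmem_list3 *parent)
{
    // empty all three slab lists and reset the bookkeeping fields
    INIT_LIST_HEAD(&parent->slabs_full);
    INIT_LIST_HEAD(&parent->slabs_partial);
    INIT_LIST_HEAD(&parent->slabs_free);
    parent->shared = NULL;
    parent->alien = NULL;
    parent->colour_next = 0;
    spin_lock_init(&parent->list_lock);
    parent->free_objects = 0;
    parent->free_touched = 0;
}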
2. Key global static variables
// array_cache and its statically initialised bootstrap instances
#define BOOT_CPUCACHE_ENTRIES 1

// used by cache_cache
static struct arraycache_init initarray_cache __initdata =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };

// used when creating the first generic kmem_cache, i.e. the AC (array_cache) cache;
// by the time the L3 cache is built, the AC cache already exists, so its
// array can simply be kmalloc'ed
static struct arraycache_init initarray_generic =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };

struct arraycache_init {
    struct array_cache cache;
    void *entries[BOOT_CPUCACHE_ENTRIES];
};

struct array_cache {
    unsigned int avail;
    unsigned int limit;
    unsigned int batchcount;
    unsigned int touched;
    spinlock_t lock;
    // entry[] is a flexible array member, so the static bootstrap instances
    // wrap array_cache in arraycache_init to reserve room for one pointer
    void *entry[];
};

// fake kmem_list3s used while the kmem_caches are being bootstrapped
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];

// statically initialised kmem_cache;
// cache_cache is the cache that kmem_cache structures themselves are allocated from
static struct kmem_cache cache_cache = {
    .batchcount  = 1,
    .limit       = BOOT_CPUCACHE_ENTRIES,
    .shared      = 1,
    .buffer_size = sizeof(struct kmem_cache),
    .name        = "kmem_cache",
};
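How the 3 * MAX_NUMNODES bootstrap lists are partitioned, and the state machine that kmem_cache_init() advances, look roughly like this in this era of mm/slab.c. The exact names are from memory, so double-check them against your tree:

// base indexes into initkmem_list3[]: each group holds one list per node
#define CACHE_CACHE 0                   // lists for cache_cache itself
#define SIZE_AC     MAX_NUMNODES        // lists for the array_cache-sized cache
#define SIZE_L3     (2 * MAX_NUMNODES)  // lists for the kmem_list3-sized cache

// bootstrap progress of the slab allocator
static enum {
    NONE,        // no general caches exist yet
    PARTIAL_AC,  // the array_cache-sized kmalloc cache exists
    PARTIAL_L3,  // the kmem_list3-sized kmalloc cache exists
    EARLY,       // kmem_cache_init() finished, kmalloc is usable
    FULL         // set later, once per-CPU array sizing is finalised
} g_cpucache_up;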
3. Slab system initialisation
void __init kmem_cache_init(void)
{
    size_t left_over;
    struct cache_sizes *sizes;
    struct cache_names *names;
    int i, order, node;

    // if there is only one NUMA node, alien caches are not used
    if (num_possible_nodes() == 1)
        use_alien_caches = 0;

    // initialise the global initkmem_list3 array
    for (i = 0; i < NUM_INIT_LISTS; i++) {
        kmem_list3_init(&initkmem_list3[i]);
        if (i < MAX_NUMNODES)
            // clear the static cache_cache's node lists
            cache_cache.nodelists[i] = NULL;
    }

    // set up cache_cache's nodelists: assign the first node_num entries of
    // initkmem_list3 to nodelists one by one (CACHE_CACHE is 0);
    // initkmem_list3 has 3 * MAX_NUMNODES entries in total, the remaining
    // 2 * MAX_NUMNODES are used by the AC and L3 kmem_caches
    set_up_list3s(&cache_cache, CACHE_CACHE);

    /*
     * The global slab_break_gfp_order caps how many pages a slab may occupy,
     * to limit fragmentation. Take an object of 3360 bytes: if its slab
     * occupies one page the waste is 736 bytes, and with two pages the waste
     * doubles as well. The cap may only be exceeded when the object is so
     * large that not even one object fits in a slab. Two values are possible:
     * with more than 32MB of memory, BREAK_GFP_ORDER_HI (1) is used, i.e. a
     * slab occupies at most 2 pages and the cap can only be broken for
     * objects larger than 8192 bytes; with 32MB or less, BREAK_GFP_ORDER_LO
     * (0) is used.
     */
    if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
        slab_break_gfp_order = BREAK_GFP_ORDER_HI;

    // current node id
    node = numa_node_id();

    // initialise cache_cache:
    // 1. add it to the cache_chain list
    // 2. compute the colour offset
    // 3. fill in the per-CPU array
    // 4. set up its kmem_list3
    INIT_LIST_HEAD(&cache_chain);
    list_add(&cache_cache.next, &cache_chain);
    // cache_line_size() is 64 bytes here
    cache_cache.colour_off = cache_line_size();
    cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
    // (?) this looks redundant: set_up_list3s has already set nodelists[node]
    cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];

    // compute buffer_size and its reciprocal;
    // buffer_size is aligned to the cache line;
    // nodelists being the last member makes it easy to size kmem_cache
    // according to the real number of nodes
    cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
                              nr_node_ids * sizeof(struct kmem_list3 *);
    cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
    cache_cache.reciprocal_buffer_size =
        reciprocal_value(cache_cache.buffer_size);

    // walk the buddy-system page orders and pick the first order that can
    // hold buffer_size, computing how many objs fit at that order
    for (order = 0; order < MAX_ORDER; order++) {
        cache_estimate(order, cache_cache.buffer_size,
                       cache_line_size(), 0, &left_over, &cache_cache.num);
        if (cache_cache.num)
            break;
    }
    cache_cache.gfporder = order;
    // left_over = slab_size - nr_objs * buffer_size - mgmt_size
    cache_cache.colour = left_over / cache_cache.colour_off;
    // size of the slab descriptor plus the kmem_bufctl_t array
    cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
                                  sizeof(struct slab), cache_line_size());

    // see the size table in the header: create the kmem_caches for the
    // standard kmalloc sizes
    sizes = malloc_sizes;
    names = cache_names;

    // first create the general caches that array_cache and kmem_list3 will
    // be allocated from; if both map to the same size index, one cache is shared
    sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
                                        sizes[INDEX_AC].cs_size,
                                        ARCH_KMALLOC_MINALIGN,
                                        ARCH_KMALLOC_FLAGS|SLAB_PANIC,
                                        NULL);
    if (INDEX_AC != INDEX_L3) {
        sizes[INDEX_L3].cs_cachep =
            kmem_cache_create(names[INDEX_L3].name,
                              sizes[INDEX_L3].cs_size,
                              ARCH_KMALLOC_MINALIGN,
                              ARCH_KMALLOC_FLAGS|SLAB_PANIC,
                              NULL);
    }

    slab_early_init = 0;

    while (sizes->cs_size != ULONG_MAX) {
        // create the remaining standard caches and their DMA counterparts
        if (!sizes->cs_cachep) {
            sizes->cs_cachep = kmem_cache_create(names->name,
                                        sizes->cs_size,
                                        ARCH_KMALLOC_MINALIGN,
                                        ARCH_KMALLOC_FLAGS|SLAB_PANIC,
                                        NULL);
        }
        sizes->cs_dmacachep = kmem_cache_create(
                    names->name_dma,
                    sizes->cs_size,
                    ARCH_KMALLOC_MINALIGN,
                    ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|SLAB_PANIC,
                    NULL);
        sizes++;
        names++;
    }

    {
        struct array_cache *ptr;

        // replace the static arraycache_init of cache_cache and of the AC
        // cache with kmalloc'ed copies
        ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
        memcpy(ptr, cpu_cache_get(&cache_cache),
               sizeof(struct arraycache_init));
        spin_lock_init(&ptr->lock);
        cache_cache.array[smp_processor_id()] = ptr;

        ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
        memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
               sizeof(struct arraycache_init));
        spin_lock_init(&ptr->lock);
        malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = ptr;
    }

    {
        int nid;

        // replace the bootstrap kmem_list3 of every online node with a
        // kmalloc'ed copy
        for_each_online_node(nid) {
            init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);

            init_list(malloc_sizes[INDEX_AC].cs_cachep,
                      &initkmem_list3[SIZE_AC + nid], nid);

            if (INDEX_AC != INDEX_L3) {
                init_list(malloc_sizes[INDEX_L3].cs_cachep,
                          &initkmem_list3[SIZE_L3 + nid], nid);
            }
        }
    }

    g_cpucache_up = EARLY;
}
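cache_estimate(), used above to size cache_cache, roughly works as follows. This is a simplified reconstruction from memory (the SLAB_LIMIT clamp on nr_objs is omitted), not a verbatim copy:

// management area = slab descriptor + one kmem_bufctl_t per object, aligned
static size_t slab_mgmt_size(size_t nr_objs, size_t align)
{
    return ALIGN(sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t), align);
}

static void cache_estimate(unsigned long gfporder, size_t buffer_size,
                           size_t align, int flags, size_t *left_over,
                           unsigned int *num)
{
    size_t slab_size = PAGE_SIZE << gfporder;   // bytes available at this order
    size_t mgmt_size;
    int nr_objs;

    if (flags & CFLGS_OFF_SLAB) {
        // descriptor lives outside the slab: every byte goes to objects
        mgmt_size = 0;
        nr_objs = slab_size / buffer_size;
    } else {
        // descriptor and bufctl array share the slab with the objects
        nr_objs = (slab_size - sizeof(struct slab)) /
                  (buffer_size + sizeof(kmem_bufctl_t));
        // the estimate ignores alignment of the management area,
        // so it can be one object too optimistic
        if (slab_mgmt_size(nr_objs, align) + nr_objs * buffer_size > slab_size)
            nr_objs--;
        mgmt_size = slab_mgmt_size(nr_objs, align);
    }
    *num = nr_objs;
    *left_over = slab_size - nr_objs * buffer_size - mgmt_size;
}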
4. Creating a kmem_cache
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
                  unsigned long flags, void (*ctor)(void *))
{
    size_t left_over, slab_size, ralign;
    struct kmem_cache *cachep = NULL, *pc;
    gfp_t gfp;

    // reject: no name / called from interrupt context / object smaller than
    // one word (4 bytes) / object larger than KMALLOC_MAX_SIZE (32 pages)
    if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
        size > KMALLOC_MAX_SIZE) {
        BUG();
    }

    if (slab_is_available()) {
        get_online_cpus();
        // lock
        mutex_lock(&cache_chain_mutex);
    }

    // (a chunk of log-printing code was removed here; it does not affect the main logic)

    // round the object size up to 32-bit word alignment if necessary
    if (size & (BYTES_PER_WORD - 1)) {
        size += (BYTES_PER_WORD - 1);
        size &= ~(BYTES_PER_WORD - 1);
    }

    // pick the base alignment:
    // SLAB_HWCACHE_ALIGN: objects must be aligned to the hardware cache line,
    // otherwise align to one 32-bit word (4 bytes)
    if (flags & SLAB_HWCACHE_ALIGN) {
        // halve ralign until size > ralign / 2, so small objects share a line
        ralign = cache_line_size();
        while (size <= ralign / 2)
            ralign /= 2;
    } else {
        ralign = BYTES_PER_WORD;
    }

    // see the comments in the original source:
    // SLAB_STORE_USER and SLAB_RED_ZONE are debugging aids
    if (flags & SLAB_STORE_USER)
        ralign = BYTES_PER_WORD;

    if (flags & SLAB_RED_ZONE) {
        ralign = REDZONE_ALIGN;
        size += REDZONE_ALIGN - 1;
        size &= ~(REDZONE_ALIGN - 1);
    }

    // ARCH_SLAB_MINALIGN is 0 here, so this branch does nothing
    // (architecture-defined minimum slab alignment)
    if (ralign < ARCH_SLAB_MINALIGN) {
        ralign = ARCH_SLAB_MINALIGN;
    }

    // use the larger alignment: the caller's request overrides the computed one
    if (ralign < align) {
        ralign = align;
    }
    // if ralign exceeds the alignment of unsigned long long, turn debugging off
    if (ralign > __alignof__(unsigned long long))
        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
    // final alignment
    align = ralign;

    if (slab_is_available())
        gfp = GFP_KERNEL;
    else
        gfp = GFP_NOWAIT;

    // allocate the new kmem_cache from cache_cache
    cachep = kmem_cache_zalloc(&cache_cache, gfp);
    if (!cachep)
        goto oops;

    // for large objects (>= PAGE_SIZE/8 = 512 bytes), outside early init,
    // keep the slab descriptor off-slab (outside the slab's pages)
    if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
        !(flags & SLAB_NOLEAKTRACE))
        flags |= CFLGS_OFF_SLAB;

    // object size aligned to align
    size = ALIGN(size, align);

    // space left over after the aligned objects and the management data
    left_over = calculate_slab_order(cachep, size, align, flags);

    // num == 0 means the object is too large to fit in any slab order
    if (!cachep->num) {
        printk(KERN_ERR
               "kmem_cache_create: couldn't create cache %s.\n", name);
        kmem_cache_free(&cache_cache, cachep);
        cachep = NULL;
        goto oops;
    }

    slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) +
                      sizeof(struct slab), align);

    // if the leftover space can hold the management data after all,
    // move the slab descriptor back on-slab to save space
    if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
        flags &= ~CFLGS_OFF_SLAB;
        left_over -= slab_size;
    }

    // if the descriptor stays off-slab, recompute slab_size
    // without the alignment padding
    if (flags & CFLGS_OFF_SLAB) {
        slab_size = cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);

        /* If we're going to use the generic kernel_map_pages()
         * poisoning, then it's going to smash the contents of
         * the redzone and userword anyhow, so switch them off.
         */
        if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
            flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
    }

    cachep->colour_off = cache_line_size();
    if (cachep->colour_off < align)
        cachep->colour_off = align;
    // number of available colour offsets
    cachep->colour = left_over / cachep->colour_off;
    cachep->slab_size = slab_size;
    cachep->flags = flags;
    cachep->gfpflags = 0;
    if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
        cachep->gfpflags |= GFP_DMA;
    cachep->buffer_size = size;
    cachep->reciprocal_buffer_size = reciprocal_value(size);

    if (flags & CFLGS_OFF_SLAB) {
        cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
        /*
         * This is a possibility for one of the malloc_sizes caches.
         * But since we go off slab only for object size greater than
         * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
         * this should not happen at all.
         * But leave a BUG_ON for some lucky dude.
         */
        BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
    }
    cachep->ctor = ctor;
    cachep->name = name;

    if (setup_cpu_cache(cachep, gfp)) {
        __kmem_cache_destroy(cachep);
        cachep = NULL;
        goto oops;
    }

    list_add(&cachep->next, &cache_chain);
oops:
    if (!cachep && (flags & SLAB_PANIC))
        panic("kmem_cache_create(): failed to create slab `%s'\n", name);
    if (slab_is_available()) {
        mutex_unlock(&cache_chain_mutex);
        put_online_cpus();
    }
    return cachep;
}
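calculate_slab_order(), which produced left_over above, is essentially the same order-scanning loop already seen in kmem_cache_init(), plus a few stop conditions. A simplified sketch (the off-slab bufctl limit and the SLAB_RECLAIM_ACCOUNT early-out of the real function are omitted):

// pick the smallest page order whose leftover is acceptable
static size_t calculate_slab_order(struct kmem_cache *cachep,
                                   size_t size, size_t align,
                                   unsigned long flags)
{
    size_t left_over = 0;
    int gfporder;

    for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
        unsigned int num;
        size_t remainder;

        cache_estimate(gfporder, size, align, flags, &remainder, &num);
        if (!num)               // not even one object fits, try a larger order
            continue;

        // found something acceptable: remember it
        cachep->num = num;
        cachep->gfporder = gfporder;
        left_over = remainder;

        // stop growing once the anti-fragmentation cap is reached ...
        if (gfporder >= slab_break_gfp_order)
            break;
        // ... or once the waste is at most 1/8 of the slab
        if (left_over * 8 <= (PAGE_SIZE << gfporder))
            break;
    }
    return left_over;
}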
5. Allocation APIs
5.1 Wrapper APIs
// allocate one object from the given cache; in kmem_cache_create this is
// what allocates a kmem_cache from cache_cache (the header also provides
// kmem_cache_zalloc as a zeroing wrapper around kmem_cache_alloc)
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    return __cache_alloc(cachep, flags, __builtin_return_address(0));
}

// allocate one object from the given node
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
    return __cache_alloc_node(cachep, flags, nodeid,
                              __builtin_return_address(0));
}

// the exported kmalloc entry point
void *__kmalloc(size_t size, gfp_t flags)
{
    return __do_kmalloc(size, flags, NULL);
}

// core kmalloc helper
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
                                          void *caller)
{
    struct kmem_cache *cachep;
    void *ret;

    // find a general cache large enough for size
    cachep = __find_general_cachep(size, flags);
    if (unlikely(ZERO_OR_NULL_PTR(cachep)))
        return cachep;
    // allocate one obj from it
    ret = __cache_alloc(cachep, flags, caller);

    return ret;
}
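__find_general_cachep() maps a requested size onto one of the malloc_sizes caches created in kmem_cache_init(). Roughly, from memory of this kernel era (the DMA branch is conditional on CONFIG_ZONE_DMA in the real source):

static inline struct kmem_cache *__find_general_cachep(size_t size,
                                                       gfp_t gfpflags)
{
    struct cache_sizes *csizep = malloc_sizes;

    if (!size)
        return ZERO_SIZE_PTR;

    // malloc_sizes is sorted ascending: walk to the first cache that fits
    while (size > csizep->cs_size)
        csizep++;

    // DMA allocations use the parallel set of DMA caches
    if (unlikely(gfpflags & GFP_DMA))
        return csizep->cs_dmacachep;
    return csizep->cs_cachep;
}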
5.2 Core APIs
// fetch one obj from the given cachep
// (the many debug checks that do not affect the main logic have been removed)
static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
                                           gfp_t flags, void *caller)
{
    void *objp;

    flags &= gfp_allowed_mask;

    // allocation fault injection; ignore
    if (slab_should_failslab(cachep, flags))
        return NULL;

    objp = __do_cache_alloc(cachep, flags);
    prefetchw(objp);

    if (unlikely((flags & __GFP_ZERO) && objp))
        memset(objp, 0, obj_size(cachep));

    return objp;
}

static __always_inline void *__do_cache_alloc(struct kmem_cache *cache,
                                              gfp_t flags)
{
    void *objp;

    if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
        objp = alternate_node_alloc(cache, flags);
        if (objp)
            goto out;
    }
    // first try the local per-CPU fast path
    objp = ____cache_alloc(cache, flags);
    // if that fails, go straight to the local node's kmem_list3
    // (which may itself fall back to other nodes)
    if (!objp)
        objp = ____cache_alloc_node(cache, flags, numa_node_id());

out:
    return objp;
}

static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *objp;
    struct array_cache *ac;

    check_irq_off();

    // take an object from the per-CPU array cache if possible;
    // if the array is empty, refill it from the slab lists
    ac = cpu_cache_get(cachep);
    if (likely(ac->avail)) {
        ac->touched = 1;
        objp = ac->entry[--ac->avail];
    } else {
        objp = cache_alloc_refill(cachep, flags);
    }
    return objp;
}

static __always_inline void *__cache_alloc_node(struct kmem_cache *cachep,
                                                gfp_t flags, int nodeid,
                                                void *caller)
{
    void *ptr;

    flags &= gfp_allowed_mask;

    if (nodeid == -1)
        nodeid = numa_node_id();

    if (unlikely(!cachep->nodelists[nodeid])) {
        ptr = fallback_alloc(cachep, flags);
        goto out;
    }

    if (nodeid == numa_node_id()) {
        /*
         * Use the locally cached objects if possible.
         * However ____cache_alloc does not allow fallback
         * to other nodes. It may fail while we still have
         * objects on other nodes available.
         */
        ptr = ____cache_alloc(cachep, flags);
        if (ptr)
            goto out;
    }
    ptr = ____cache_alloc_node(cachep, flags, nodeid);
out:
    if (unlikely((flags & __GFP_ZERO) && ptr))
        memset(ptr, 0, obj_size(cachep));

    return ptr;
}

static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
                                  int nodeid)
{
    struct list_head *entry;
    struct slab *slabp;
    struct kmem_list3 *l3;
    void *obj;
    int x;

    l3 = cachep->nodelists[nodeid];
retry:
    spin_lock(&l3->list_lock);
    entry = l3->slabs_partial.next;
    if (entry == &l3->slabs_partial) {
        l3->free_touched = 1;
        entry = l3->slabs_free.next;
        if (entry == &l3->slabs_free)
            goto must_grow;
    }

    slabp = list_entry(entry, struct slab, list);
    check_spinlock_acquired_node(cachep, nodeid);
    check_slabp(cachep, slabp);

    obj = slab_get_obj(cachep, slabp, nodeid);
    check_slabp(cachep, slabp);
    l3->free_objects--;
    list_del(&slabp->list);

    if (slabp->free == BUFCTL_END)
        list_add(&slabp->list, &l3->slabs_full);
    else
        list_add(&slabp->list, &l3->slabs_partial);

    spin_unlock(&l3->list_lock);
    goto done;

must_grow:
    spin_unlock(&l3->list_lock);
    x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
    if (x)
        goto retry;

    return fallback_alloc(cachep, flags);

done:
    return obj;
}

/*
 * Fallback function if there was no memory available and no objects on a
 * certain node and fall back is permitted. First we scan all the
 * available nodelists for available objects. If that fails then we
 * perform an allocation without specifying a node. This allows the page
 * allocator to do its reclaim / fallback magic. We then insert the
 * slab into the proper nodelist and then allocate from it.
 */
static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
{
    struct zonelist *zonelist;
    gfp_t local_flags;
    struct zoneref *z;
    struct zone *zone;
    enum zone_type high_zoneidx = gfp_zone(flags);
    void *obj = NULL;
    int nid;

    if (flags & __GFP_THISNODE)
        return NULL;

    zonelist = node_zonelist(slab_node(current->mempolicy), flags);
    local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

retry:
    /*
     * Look through allowed nodes for objects available
     * from existing per node queues.
     */
    for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
        nid = zone_to_nid(zone);

        if (cpuset_zone_allowed_hardwall(zone, flags) &&
            cache->nodelists[nid] &&
            cache->nodelists[nid]->free_objects) {
            obj = ____cache_alloc_node(cache,
                                       flags | GFP_THISNODE, nid);
            if (obj)
                break;
        }
    }

    if (!obj) {
        /*
         * This allocation will be performed within the constraints
         * of the current cpuset / memory policy requirements.
         * We may trigger various forms of reclaim on the allowed
         * set and go into memory reserves if necessary.
         */
        obj = kmem_getpages(cache, local_flags, numa_node_id());
        if (obj) {
            nid = page_to_nid(virt_to_page(obj));
            if (cache_grow(cache, flags, nid, obj)) {
                obj = ____cache_alloc_node(cache,
                                           flags | GFP_THISNODE, nid);
                if (!obj)
                    /*
                     * Another processor may allocate the
                     * objects in the slab since we are
                     * not holding any locks.
                     */
                    goto retry;
            } else {
                /* cache_grow already freed obj */
                obj = NULL;
            }
        }
    }
    return obj;
}
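The missing piece of the fast path above is cache_alloc_refill(), which ____cache_alloc falls back to when the per-CPU array is empty. A simplified sketch of its flow, from memory of this kernel era: the per-node shared array transfer, the BATCHREFILL_LIMIT clamp and all debug checks are omitted here, so consult the real source for those details.

static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
    int batchcount, node;
    struct array_cache *ac;
    struct kmem_list3 *l3;

retry:
    node = numa_node_id();
    ac = cpu_cache_get(cachep);
    l3 = cachep->nodelists[node];
    batchcount = ac->batchcount;

    spin_lock(&l3->list_lock);

    // (the real code first tries to transfer objects from l3->shared)
    while (batchcount > 0) {
        struct list_head *entry = l3->slabs_partial.next;
        struct slab *slabp;

        if (entry == &l3->slabs_partial) {
            entry = l3->slabs_free.next;
            if (entry == &l3->slabs_free)
                break;                  // both lists empty: must grow
        }
        slabp = list_entry(entry, struct slab, list);

        // pull objects off this slab until it is full or the batch is done
        while (slabp->inuse < cachep->num && batchcount--)
            ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, node);

        // requeue the slab on the list matching its new state
        list_del(&slabp->list);
        if (slabp->free == BUFCTL_END)
            list_add(&slabp->list, &l3->slabs_full);
        else
            list_add(&slabp->list, &l3->slabs_partial);
    }

    l3->free_objects -= ac->avail;
    spin_unlock(&l3->list_lock);

    if (!ac->avail) {
        // every list was empty: ask the buddy system for a new slab
        if (!cache_grow(cachep, flags | GFP_THISNODE, node, NULL))
            return NULL;
        // cache_grow may re-enable interrupts, so the array can change
        ac = cpu_cache_get(cachep);
        if (!ac->avail)
            goto retry;
    }
    ac->touched = 1;
    return ac->entry[--ac->avail];
}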