1. Core structures

1.1 slab

  struct slab {
      struct list_head list;      /* links this slab into one of the kmem_list3 lists (free/partial/full) */
      unsigned long colouroff;    /* colour offset (slab colouring) */
      void *s_mem;                /* address of the first object, colour offset already applied */
      unsigned int inuse;         /* number of objects in use in this slab */
      kmem_bufctl_t free;         /* index of the next free object; head of the per-slab free-object list */
      unsigned short nodeid;      /* NUMA node id */
  };

  /*
   * struct slab_rcu
   *
   * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
   * arrange for kmem_freepages to be called via RCU. This is useful if
   * we need to approach a kernel structure obliquely, from its address
   * obtained without the usual locking. We can lock the structure to
   * stabilize it and check it's still at the given address, only if we
   * can be sure that the memory has not been meanwhile reused for some
   * other kind of object (which our subsystem's lock might corrupt).
   *
   * rcu_read_lock before reading the address, then rcu_read_unlock after
   * taking the spinlock within the structure expected at that address.
   *
   * We assume struct slab_rcu can overlay struct slab when destroying.
   */
  struct slab_rcu {
      struct rcu_head head;
      struct kmem_cache *cachep;
      void *addr;
  };
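Note that `free` is not a pointer but an object index: a kmem_bufctl_t array stored right behind the slab descriptor chains the free objects together by index, and `free` holds the head of that chain. Stripped of debug checks, the kernel helpers that implement this look roughly like the following sketch:

  /* the kmem_bufctl_t array lives immediately after the slab descriptor */
  static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
  {
      return (kmem_bufctl_t *) (slabp + 1);
  }

  /* object i starts at s_mem + i * buffer_size */
  static inline void *index_to_obj(struct kmem_cache *cache,
                                   struct slab *slab, unsigned int idx)
  {
      return slab->s_mem + cache->buffer_size * idx;
  }

  /* pop the object at index slabp->free and advance free to the next link */
  static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
                            int nodeid)
  {
      void *objp = index_to_obj(cachep, slabp, slabp->free);

      slabp->inuse++;
      slabp->free = slab_bufctl(slabp)[slabp->free]; /* BUFCTL_END when the slab is full */
      return objp;
  }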

1.2 kmem_cache

  struct kmem_cache {
      struct array_cache *array[NR_CPUS]; /* per-CPU object cache */
      unsigned int batchcount;        /* number of objs moved between the per-CPU cache and the slab lists at a time */
      unsigned int limit;             /* upper bound on objs held in the per-CPU cache */
      unsigned int shared;            /* whether a shared array is used; the shared array lives in kmem_list3 */
      unsigned int buffer_size;       /* size of a cached obj, aligned to the cache line */
      u32 reciprocal_buffer_size;     /* reciprocal of buffer_size */
      unsigned int flags;             /* constant cache flags, e.g. CFLGS_OFF_SLAB */
      unsigned int num;               /* number of objs per slab */
      unsigned int gfporder;          /* page order used when requesting pages from the buddy allocator */
      gfp_t gfpflags;                 /* GFP flags used when requesting pages from the buddy allocator */
      size_t colour;                  /* number of distinct colour offsets (colouring range) */
      unsigned int colour_off;        /* colour offset granularity (one cache line) */
      struct kmem_cache *slabp_cache; /* if the slab descriptor is off-slab, it is allocated from this cache */
      unsigned int slab_size;         /* size of the slab management area (slab descriptor + bufctl array, aligned) */
      unsigned int dflags;            /* dynamic flags */
      void (*ctor)(void *obj);        /* object constructor */
      const char *name;               /* cache name */
      struct list_head next;          /* links this cache into cache_chain */
      struct kmem_list3 *nodelists[MAX_NUMNODES]; /* kept last, so the struct can be sized for the real node count */
  };
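The reciprocal of buffer_size exists so that, when an object is freed, its index inside the slab can be computed with a multiplication instead of a division. A sketch of the corresponding helper (reciprocal_divide() comes from linux/reciprocal_div.h):

  static inline unsigned int obj_to_index(const struct kmem_cache *cache,
                                          const struct slab *slab, void *obj)
  {
      u32 offset = (obj - slab->s_mem);
      /* equivalent to offset / cache->buffer_size, but without a divide */
      return reciprocal_divide(offset, cache->reciprocal_buffer_size);
  }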

1.3 kmem_list3

  struct kmem_list3 {
      spinlock_t list_lock;           /* protects the lists below */
      struct list_head slabs_partial; /* slabs that still have free objects */
      struct list_head slabs_full;    /* slabs with no free objects */
      struct list_head slabs_free;    /* slabs whose objects are all free */
      unsigned long free_objects;     /* number of free objects on this node */
      unsigned int free_limit;        /* upper bound on free objects kept on this node */
      unsigned int colour_next;       /* colour to use for the next slab allocated on this node */
      struct array_cache *shared;     /* array shared by all CPUs of this node */
      struct array_cache **alien;     /* arrays holding objects that belong to other nodes */
      unsigned long next_reap;        /* time of the next reap pass */
      int free_touched;               /* set when the free list was used recently; checked by the reaper */
  };
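kmem_list3_init(), used during bootstrap in section 3, simply puts a node list into a known-empty state; stripped down it looks roughly like this:

  static void kmem_list3_init(struct kmem_list3 *parent)
  {
      INIT_LIST_HEAD(&parent->slabs_full);
      INIT_LIST_HEAD(&parent->slabs_partial);
      INIT_LIST_HEAD(&parent->slabs_free);
      parent->shared = NULL;
      parent->alien = NULL;
      parent->colour_next = 0;
      spin_lock_init(&parent->list_lock);
      parent->free_objects = 0;
      parent->free_touched = 0;
  }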

2. Core global static variables

  /* array_cache and the statically initialized bootstrap variables */
  #define BOOT_CPUCACHE_ENTRIES 1

  /* used by cache_cache */
  static struct arraycache_init initarray_cache __initdata =
      { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
  /* used when setting up the generic kmem_caches, i.e. for the ac cache;
   * by the time the l3 cache is constructed, the ac cache already works,
   * so kmalloc can be used directly */
  static struct arraycache_init initarray_generic =
      { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };

  struct arraycache_init {
      struct array_cache cache;
      void *entries[BOOT_CPUCACHE_ENTRIES];
  };

  struct array_cache {
      unsigned int avail;
      unsigned int limit;
      unsigned int batchcount;
      unsigned int touched;
      spinlock_t lock;
      /* entry[] is a flexible array member, so the static bootstrap objects
       * wrap array_cache in arraycache_init to reserve room for one pointer */
      void *entry[];
  };

  /* fake kmem_list3s used while the kmem_caches are being bootstrapped */
  #define NUM_INIT_LISTS (3 * MAX_NUMNODES)
  struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];

  /* statically initialized kmem_cache;
   * cache_cache is the cache from which kmem_cache structures themselves are allocated */
  static struct kmem_cache cache_cache = {
      .batchcount = 1,
      .limit = BOOT_CPUCACHE_ENTRIES,
      .shared = 1,
      .buffer_size = sizeof(struct kmem_cache),
      .name = "kmem_cache",
  };
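cpu_cache_get(), used throughout the bootstrap and allocation paths below, is nothing more than a lookup of the current CPU's array_cache:

  static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
  {
      return cachep->array[smp_processor_id()];
  }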

3. Slab system initialization

  void __init kmem_cache_init(void)
  {
      size_t left_over;
      struct cache_sizes *sizes;
      struct cache_names *names;
      int i, order, node;

      /* with only one NUMA node there is no need for alien caches */
      if (num_possible_nodes() == 1)
          use_alien_caches = 0;

      /* initialize initkmem_list3 */
      for (i = 0; i < NUM_INIT_LISTS; i++) {
          /* put every entry of the global initkmem_list3 into a known state */
          kmem_list3_init(&initkmem_list3[i]);
          if (i < MAX_NUMNODES)
              /* clear the node lists of the static cache_cache */
              cache_cache.nodelists[i] = NULL;
      }

      /* initialize cache_cache:
       * point nodelists at the first node_num entries of initkmem_list3
       * (CACHE_CACHE is 0). initkmem_list3 has 3 * MAX_NUMNODES entries;
       * the remaining 2 * MAX_NUMNODES are used later for the ac and l3
       * kmem_caches. */
      set_up_list3s(&cache_cache, CACHE_CACHE);

      /* The global slab_break_gfp_order caps how many pages a slab may use,
       * to limit fragmentation. Take an object of 3360 bytes: if its slab
       * occupies one page, the waste is 736 bytes; with two pages, the
       * waste doubles as well. The cap may only be exceeded when the object
       * is so large that not even one fits in a slab of that size. Two values
       * are possible: with more than 32MB of memory, BREAK_GFP_ORDER_HI (1)
       * is used, i.e. at most 2 pages per slab, and only objects larger than
       * 8192 bytes may exceed the cap; with 32MB or less, BREAK_GFP_ORDER_LO
       * (0) is used. */
      if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
          slab_break_gfp_order = BREAK_GFP_ORDER_HI;

      /* current node id */
      node = numa_node_id();

      /* set up cache_cache:
       * 1. add it to the cache_chain list
       * 2. compute the colour offset
       * 3. fill in the per-CPU array
       * 4. set up its kmem_list3 */
      INIT_LIST_HEAD(&cache_chain);
      list_add(&cache_cache.next, &cache_chain);
      /* cache_line_size() is 64 bytes here */
      cache_cache.colour_off = cache_line_size();
      cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
      /* looks a bit redundant: set_up_list3s() has already set this */
      cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];

      /* compute buffer_size and its reciprocal;
       * buffer_size is aligned to the cache line.
       * nodelists is the last member precisely so that the struct size can be
       * computed from the real number of nodes. */
      cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
                                nr_node_ids * sizeof(struct kmem_list3 *);
      cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
                                      cache_line_size());
      cache_cache.reciprocal_buffer_size =
          reciprocal_value(cache_cache.buffer_size);

      /* walking up the buddy orders, find the first order that can hold
       * buffer_size, and compute how many objects fit at that order */
      for (order = 0; order < MAX_ORDER; order++) {
          cache_estimate(order, cache_cache.buffer_size,
                         cache_line_size(), 0, &left_over, &cache_cache.num);
          if (cache_cache.num)
              break;
      }
      cache_cache.gfporder = order;
      /* left_over = slab_size - nr_objs * buffer_size - mgmt_size */
      cache_cache.colour = left_over / cache_cache.colour_off;
      /* size of the slab descriptor plus the kmem_bufctl_t array */
      cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
                                    sizeof(struct slab), cache_line_size());

      /* see the header: create the kmalloc caches for each size class */
      sizes = malloc_sizes;
      names = cache_names;

      /* first create the general-purpose caches that array_cache and
       * kmem_list3 will be allocated from; if both fall into the same size
       * class, the same cache is used */
      sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
                                          sizes[INDEX_AC].cs_size,
                                          ARCH_KMALLOC_MINALIGN,
                                          ARCH_KMALLOC_FLAGS|SLAB_PANIC,
                                          NULL);
      if (INDEX_AC != INDEX_L3) {
          sizes[INDEX_L3].cs_cachep =
              kmem_cache_create(names[INDEX_L3].name,
                                sizes[INDEX_L3].cs_size,
                                ARCH_KMALLOC_MINALIGN,
                                ARCH_KMALLOC_FLAGS|SLAB_PANIC,
                                NULL);
      }

      slab_early_init = 0;

      while (sizes->cs_size != ULONG_MAX) {
          /* create the remaining size classes, one normal and one DMA cache each */
          if (!sizes->cs_cachep) {
              sizes->cs_cachep = kmem_cache_create(names->name,
                                          sizes->cs_size,
                                          ARCH_KMALLOC_MINALIGN,
                                          ARCH_KMALLOC_FLAGS|SLAB_PANIC,
                                          NULL);
          }
          sizes->cs_dmacachep = kmem_cache_create(
                                          names->name_dma,
                                          sizes->cs_size,
                                          ARCH_KMALLOC_MINALIGN,
                                          ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|SLAB_PANIC,
                                          NULL);
          sizes++;
          names++;
      }
      {
          struct array_cache *ptr;

          /* replace the static arraycache_init of cache_cache and of the ac
           * cache with kmalloc'ed copies */
          ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
          memcpy(ptr, cpu_cache_get(&cache_cache),
                 sizeof(struct arraycache_init));
          spin_lock_init(&ptr->lock);
          cache_cache.array[smp_processor_id()] = ptr;

          ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
          memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
                 sizeof(struct arraycache_init));
          spin_lock_init(&ptr->lock);
          malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = ptr;
      }
      {
          int nid;

          /* replace the bootstrap kmem_list3 of every online node */
          for_each_online_node(nid) {
              init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
              init_list(malloc_sizes[INDEX_AC].cs_cachep,
                        &initkmem_list3[SIZE_AC + nid], nid);
              if (INDEX_AC != INDEX_L3) {
                  init_list(malloc_sizes[INDEX_L3].cs_cachep,
                            &initkmem_list3[SIZE_L3 + nid], nid);
              }
          }
      }
      g_cpucache_up = EARLY;
  }
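cache_estimate(), called above to pick gfporder, answers two questions for a given order: how many objects fit, and how much space is left over. A simplified sketch of the on-slab case (the off-slab path and the SLAB_LIMIT clamp are omitted):

  static void cache_estimate(unsigned long gfporder, size_t buffer_size,
                             size_t align, int flags, size_t *left_over,
                             unsigned int *num)
  {
      size_t slab_size = PAGE_SIZE << gfporder;
      size_t mgmt_size;
      int nr_objs;

      /* first guess: each object also needs one kmem_bufctl_t slot */
      nr_objs = (slab_size - sizeof(struct slab)) /
                (buffer_size + sizeof(kmem_bufctl_t));

      /* aligning the management area may push us over the slab size */
      mgmt_size = ALIGN(sizeof(struct slab) +
                        nr_objs * sizeof(kmem_bufctl_t), align);
      if (mgmt_size + nr_objs * buffer_size > slab_size) {
          nr_objs--;
          mgmt_size = ALIGN(sizeof(struct slab) +
                            nr_objs * sizeof(kmem_bufctl_t), align);
      }

      *num = nr_objs;
      *left_over = slab_size - nr_objs * buffer_size - mgmt_size;
  }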

4. Creating a kmem_cache

  struct kmem_cache *kmem_cache_create(const char *name, size_t size,
              size_t align, unsigned long flags, void (*ctor)(void *))
  {
      size_t left_over, slab_size, ralign;
      struct kmem_cache *cachep = NULL, *pc;
      gfp_t gfp;

      /* sanity checks: no name, in interrupt context, object smaller than a
       * word, or larger than KMALLOC_MAX_SIZE */
      if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
          size > KMALLOC_MAX_SIZE) {
          BUG();
      }
      if (slab_is_available()) {
          get_online_cpus();
          /* lock */
          mutex_lock(&cache_chain_mutex);
      }

      /* a debug print was removed here; it does not affect the main logic */

      /* round the object size up to a word (BYTES_PER_WORD) boundary */
      if (size & (BYTES_PER_WORD - 1)) {
          size += (BYTES_PER_WORD - 1);
          size &= ~(BYTES_PER_WORD - 1);
      }

      /* decide the alignment:
       * SLAB_HWCACHE_ALIGN: align objects to the hardware cache line,
       * otherwise align to the word size (4 bytes on 32-bit) */
      if (flags & SLAB_HWCACHE_ALIGN) {
          /* halve the alignment until more than one object fits per line,
           * i.e. until size > ralign / 2 */
          ralign = cache_line_size();
          while (size <= ralign / 2)
              ralign /= 2;
      } else {
          ralign = BYTES_PER_WORD;
      }

      /* see the comment in the kernel source:
       * SLAB_STORE_USER and SLAB_RED_ZONE are debugging aids */
      if (flags & SLAB_STORE_USER)
          ralign = BYTES_PER_WORD;
      if (flags & SLAB_RED_ZONE) {
          ralign = REDZONE_ALIGN;
          size += REDZONE_ALIGN - 1;
          size &= ~(REDZONE_ALIGN - 1);
      }

      /* ARCH_SLAB_MINALIGN is 0 here, so this branch does nothing; it lets an
       * architecture enforce a minimum object alignment */
      if (ralign < ARCH_SLAB_MINALIGN) {
          ralign = ARCH_SLAB_MINALIGN;
      }
      /* take the larger value: a caller-supplied align overrides the computed one */
      if (ralign < align) {
          ralign = align;
      }
      /* if the alignment exceeds that of unsigned long long, the debug
       * features cannot be kept, so turn them off */
      if (ralign > __alignof__(unsigned long long))
          flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
      /* final alignment */
      align = ralign;

      if (slab_is_available())
          gfp = GFP_KERNEL;
      else
          gfp = GFP_NOWAIT;

      /* allocate a kmem_cache from cache_cache */
      cachep = kmem_cache_zalloc(&cache_cache, gfp);
      if (!cachep)
          goto oops;

      /* if the object is fairly large (at least PAGE_SIZE/8 = 512 bytes) and
       * we are past early init, keep the slab descriptor off-slab */
      if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
          !(flags & SLAB_NOLEAKTRACE))
          flags |= CFLGS_OFF_SLAB;

      /* align the object size */
      size = ALIGN(size, align);
      /* space left over after fitting the aligned objects and the management area */
      left_over = calculate_slab_order(cachep, size, align, flags);
      /* this only happens when the object is too large to fit at all */
      if (!cachep->num) {
          printk(KERN_ERR
                 "kmem_cache_create: couldn't create cache %s.\n", name);
          kmem_cache_free(&cache_cache, cachep);
          cachep = NULL;
          goto oops;
      }

      slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) +
                        sizeof(struct slab), align);

      /* if the left-over space is at least as large as the management area,
       * move the slab descriptor back on-slab and save the extra allocation */
      if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
          flags &= ~CFLGS_OFF_SLAB;
          left_over -= slab_size;
      }

      /* for an off-slab descriptor, recompute slab_size without the alignment */
      if (flags & CFLGS_OFF_SLAB) {
          slab_size = cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
          /* If we're going to use the generic kernel_map_pages()
           * poisoning, then it's going to smash the contents of
           * the redzone and userword anyhow, so switch them off.
           */
          if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
              flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
      }

      cachep->colour_off = cache_line_size();
      if (cachep->colour_off < align)
          cachep->colour_off = align;
      /* how many distinct colour offsets are available */
      cachep->colour = left_over / cachep->colour_off;
      cachep->slab_size = slab_size;
      cachep->flags = flags;
      cachep->gfpflags = 0;
      if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
          cachep->gfpflags |= GFP_DMA;
      cachep->buffer_size = size;
      cachep->reciprocal_buffer_size = reciprocal_value(size);

      if (flags & CFLGS_OFF_SLAB) {
          cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
          /*
           * This is a possibility for one of the malloc_sizes caches.
           * But since we go off slab only for object size greater than
           * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
           * this should not happen at all.
           * But leave a BUG_ON for some lucky dude.
           */
          BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
      }
      cachep->ctor = ctor;
      cachep->name = name;

      if (setup_cpu_cache(cachep, gfp)) {
          __kmem_cache_destroy(cachep);
          cachep = NULL;
          goto oops;
      }

      list_add(&cachep->next, &cache_chain);
  oops:
      if (!cachep && (flags & SLAB_PANIC))
          panic("kmem_cache_create(): failed to create slab `%s'\n", name);
      if (slab_is_available()) {
          mutex_unlock(&cache_chain_mutex);
          put_online_cpus();
      }
      return cachep;
  }
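calculate_slab_order(), used above, runs essentially the same loop that kmem_cache_init ran by hand for cache_cache, plus the anti-fragmentation checks described in section 3. A simplified sketch (the off-slab bookkeeping and the extra heuristics of the real function are omitted):

  static size_t calculate_slab_order(struct kmem_cache *cachep,
                                     size_t size, size_t align, unsigned long flags)
  {
      size_t left_over = 0;
      int gfporder;

      for (gfporder = 0; gfporder <= MAX_ORDER; gfporder++) {
          unsigned int num;
          size_t remainder;

          cache_estimate(gfporder, size, align, flags, &remainder, &num);
          if (!num)                       /* not even one object fits yet */
              continue;

          cachep->num = num;
          cachep->gfporder = gfporder;
          left_over = remainder;

          /* stop growing once the anti-fragmentation cap is reached */
          if (gfporder >= slab_break_gfp_order)
              break;

          /* accept this order once the waste is at most 1/8 of the slab */
          if (left_over * 8 <= (PAGE_SIZE << gfporder))
              break;
      }
      return left_over;
  }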

5. Allocation-related APIs

5.1 Wrapper APIs

  /* allocate one object from the given kmem_cache
   * (kmem_cache_create uses this on cache_cache; the header also provides
   * kmem_cache_zalloc as a wrapper around kmem_cache_alloc) */
  void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
  {
      return __cache_alloc(cachep, flags, __builtin_return_address(0));
  }

  /* allocate one object from the given node */
  void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
  {
      return __cache_alloc_node(cachep, flags, nodeid, __builtin_return_address(0));
  }

  /* the exported kmalloc */
  void *__kmalloc(size_t size, gfp_t flags)
  {
      return __do_kmalloc(size, flags, NULL);
  }

  /* core kmalloc helper */
  static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, void *caller)
  {
      struct kmem_cache *cachep;
      void *ret;

      /* pick the matching general-purpose cache */
      cachep = __find_general_cachep(size, flags);
      if (unlikely(ZERO_OR_NULL_PTR(cachep)))
          return cachep;
      /* allocate one object from it */
      ret = __cache_alloc(cachep, flags, caller);
      return ret;
  }
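__find_general_cachep() walks malloc_sizes, which is sorted by size, until it finds the first size class that fits, preferring the DMA cache when GFP_DMA is requested. A sketch with the CONFIG_ZONE_DMA guards stripped:

  static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags)
  {
      struct cache_sizes *csizep = malloc_sizes;

      if (!size)
          return ZERO_SIZE_PTR;

      /* malloc_sizes is ordered by ascending cs_size */
      while (size > csizep->cs_size)
          csizep++;

      if (unlikely(gfpflags & GFP_DMA))
          return csizep->cs_dmacachep;
      return csizep->cs_cachep;
  }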

5.2 Core APIs

  /* fetch one object from the given cachep;
   * various debug checks that do not affect the main logic have been removed */
  static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
                                             gfp_t flags, void *caller)
  {
      void *objp;

      flags &= gfp_allowed_mask;
      /* allocation fault injection, ignore */
      if (slab_should_failslab(cachep, flags))
          return NULL;
      objp = __do_cache_alloc(cachep, flags);
      prefetchw(objp);
      if (unlikely((flags & __GFP_ZERO) && objp))
          memset(objp, 0, obj_size(cachep));
      return objp;
  }

  static __always_inline void *__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
  {
      void *objp;

      if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
          objp = alternate_node_alloc(cache, flags);
          if (objp)
              goto out;
      }
      /* try the local node first */
      objp = ____cache_alloc(cache, flags);
      /* then fall back to other nodes */
      if (!objp)
          objp = ____cache_alloc_node(cache, flags, numa_node_id());
  out:
      return objp;
  }

  static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
  {
      void *objp;
      struct array_cache *ac;

      check_irq_off();
      /* serve from the per-CPU array_cache first;
       * if it is empty, refill it */
      ac = cpu_cache_get(cachep);
      if (likely(ac->avail)) {
          ac->touched = 1;
          objp = ac->entry[--ac->avail];
      } else {
          objp = cache_alloc_refill(cachep, flags);
      }
      return objp;
  }

  static __always_inline void *__cache_alloc_node(struct kmem_cache *cachep,
                                                  gfp_t flags, int nodeid, void *caller)
  {
      void *ptr;

      flags &= gfp_allowed_mask;
      if (nodeid == -1)
          nodeid = numa_node_id();
      if (unlikely(!cachep->nodelists[nodeid])) {
          ptr = fallback_alloc(cachep, flags);
          goto out;
      }
      if (nodeid == numa_node_id()) {
          /*
           * Use the locally cached objects if possible.
           * However ____cache_alloc does not allow fallback
           * to other nodes. It may fail while we still have
           * objects on other nodes available.
           */
          ptr = ____cache_alloc(cachep, flags);
          if (ptr)
              goto out;
      }
      ptr = ____cache_alloc_node(cachep, flags, nodeid);
  out:
      if (unlikely((flags & __GFP_ZERO) && ptr))
          memset(ptr, 0, obj_size(cachep));
      return ptr;
  }

  static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
  {
      struct list_head *entry;
      struct slab *slabp;
      struct kmem_list3 *l3;
      void *obj;
      int x;

      l3 = cachep->nodelists[nodeid];
  retry:
      spin_lock(&l3->list_lock);
      /* prefer a partial slab; fall back to a completely free slab */
      entry = l3->slabs_partial.next;
      if (entry == &l3->slabs_partial) {
          l3->free_touched = 1;
          entry = l3->slabs_free.next;
          if (entry == &l3->slabs_free)
              goto must_grow;
      }
      slabp = list_entry(entry, struct slab, list);
      check_spinlock_acquired_node(cachep, nodeid);
      check_slabp(cachep, slabp);
      obj = slab_get_obj(cachep, slabp, nodeid);
      check_slabp(cachep, slabp);
      l3->free_objects--;
      /* re-queue the slab on the full or partial list as appropriate */
      list_del(&slabp->list);
      if (slabp->free == BUFCTL_END)
          list_add(&slabp->list, &l3->slabs_full);
      else
          list_add(&slabp->list, &l3->slabs_partial);
      spin_unlock(&l3->list_lock);
      goto done;
  must_grow:
      spin_unlock(&l3->list_lock);
      x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
      if (x)
          goto retry;
      return fallback_alloc(cachep, flags);
  done:
      return obj;
  }

  /*
   * Fallback function if there was no memory available and no objects on a
   * certain node and fall back is permitted. First we scan all the
   * available nodelists for available objects. If that fails then we
   * perform an allocation without specifying a node. This allows the page
   * allocator to do its reclaim / fallback magic. We then insert the
   * slab into the proper nodelist and then allocate from it.
   */
  static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
  {
      struct zonelist *zonelist;
      gfp_t local_flags;
      struct zoneref *z;
      struct zone *zone;
      enum zone_type high_zoneidx = gfp_zone(flags);
      void *obj = NULL;
      int nid;

      if (flags & __GFP_THISNODE)
          return NULL;

      zonelist = node_zonelist(slab_node(current->mempolicy), flags);
      local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

  retry:
      /*
       * Look through allowed nodes for objects available
       * from existing per node queues.
       */
      for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
          nid = zone_to_nid(zone);
          if (cpuset_zone_allowed_hardwall(zone, flags) &&
              cache->nodelists[nid] &&
              cache->nodelists[nid]->free_objects) {
              obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid);
              if (obj)
                  break;
          }
      }
      if (!obj) {
          /*
           * This allocation will be performed within the constraints
           * of the current cpuset / memory policy requirements.
           * We may trigger various forms of reclaim on the allowed
           * set and go into memory reserves if necessary.
           */
          obj = kmem_getpages(cache, local_flags, numa_node_id());
          if (obj) {
              nid = page_to_nid(virt_to_page(obj));
              if (cache_grow(cache, flags, nid, obj)) {
                  obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid);
                  if (!obj)
                      /*
                       * Another processor may allocate the
                       * objects in the slab since we are
                       * not holding any locks.
                       */
                      goto retry;
              } else {
                  /* cache_grow already freed obj */
                  obj = NULL;
              }
          }
      }
      return obj;
  }
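cache_alloc_refill(), called by ____cache_alloc() when the per-CPU array is empty, pulls up to batchcount objects from the node's shared array or slab lists into the array_cache, and grows the cache when the node has nothing left. A simplified sketch (debug checks, alien-cache handling and the original goto-based retry are omitted; the retry is expressed here as a recursive call):

  static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
  {
      int batchcount, node = numa_node_id();
      struct array_cache *ac = cpu_cache_get(cachep);
      struct kmem_list3 *l3 = cachep->nodelists[node];

      batchcount = ac->batchcount;
      spin_lock(&l3->list_lock);

      /* first try to take a whole batch from the node-shared array */
      if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
          goto alloc_done;

      /* otherwise pull objects off partial (or free) slabs one by one */
      while (batchcount > 0) {
          struct list_head *entry = l3->slabs_partial.next;
          struct slab *slabp;

          if (entry == &l3->slabs_partial) {
              entry = l3->slabs_free.next;
              if (entry == &l3->slabs_free)
                  break;                  /* nothing left on this node */
          }
          slabp = list_entry(entry, struct slab, list);

          while (slabp->inuse < cachep->num && batchcount--) {
              ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, node);
              l3->free_objects--;
          }

          /* re-queue the slab on the full or partial list */
          list_del(&slabp->list);
          if (slabp->free == BUFCTL_END)
              list_add(&slabp->list, &l3->slabs_full);
          else
              list_add(&slabp->list, &l3->slabs_partial);
      }

  alloc_done:
      spin_unlock(&l3->list_lock);

      /* the node had nothing: grow the cache and retry
       * (the real code jumps back with a goto instead of recursing) */
      if (!ac->avail && cache_grow(cachep, flags, node, NULL))
          return cache_alloc_refill(cachep, flags);

      if (!ac->avail)
          return NULL;

      ac->touched = 1;
      return ac->entry[--ac->avail];
  }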