背景
上层业务有时需要内核提供一些定制的内存回收接口,内核层自身也可能需要对内存机制做优化。这两种情况都需要在原有内存回收机制上做一些hook操作,因此了解Linux内核的内存回收流程很重要。
本章节主要讲kswapd线程:当空闲内存低于low watermark时,kswapd会被唤醒并开始工作。
内存回收步骤
- kswapd初始化
1、设置每次swap的page数
2、创建所有numa节点对应的kswapd线程
- 执行kswapd逻辑
- 项目3
内存回收详细步骤
- kswapd初始化
/*
 * kswapd_init - boot-time initialisation of the kswapd threads.
 *
 * Calls swap_setup() and then starts one kswapd thread for every node that
 * is in the N_MEMORY state. Always returns 0.
 */
static int __init kswapd_init(void)
{
	int nid;

	/*
	 * Set page_cluster, which determines how many pages are handled per
	 * swap in/out (2^page_cluster).
	 */
	swap_setup();

	/* Walk all NUMA nodes and create a kswapd thread for each of them. */
	for_each_node_state(nid, N_MEMORY)
		kswapd_run(nid);

	return 0;
}
void kswapd_run(int nid)
{//创建节点id对应的kswapd线程pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
}
- 执行kswapd逻辑
static int kswapd(void *p)
{for ( ; ; ) {
kswapd_try_sleep://判断kswapd是否进入睡眠kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, highest_zoneidx);/* Read the new order and highest_zoneidx */alloc_order = READ_ONCE(pgdat->kswapd_order);highest_zoneidx = kswapd_highest_zoneidx(pgdat, highest_zoneidx);WRITE_ONCE(pgdat->kswapd_order, 0);WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);reclaim_order = balance_pgdat(pgdat, alloc_order, highest_zoneidx);if (reclaim_order < alloc_order)goto kswapd_try_sleep;}tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);return 0;
}static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,unsigned int highest_zoneidx)
{//将kswapd线程加入此内存节点的wait队列prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);/** Try to sleep for a short interval. Note that kcompactd will only be* woken if it is possible to sleep for a short interval. This is* deliberate on the assumption that if reclaim cannot keep an* eligible zone balanced that it's also unlikely that compaction will* succeed.*/if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {/** Compaction records what page blocks it recently failed to* isolate pages from and skips them in the future scanning.* When kswapd is going to sleep, it is reasonable to assume* that pages and compaction may succeed so reset the cache.*/reset_isolation_suitable(pgdat);/** We have freed the memory, now we should compact it to make* allocation of the requested order possible.*/wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);remaining = schedule_timeout(HZ/10);/** If woken prematurely then reset kswapd_highest_zoneidx and* order. The values will either be from a wakeup request or* the previous request that slept prematurely.*/if (remaining) {WRITE_ONCE(pgdat->kswapd_highest_zoneidx,kswapd_highest_zoneidx(pgdat,highest_zoneidx));if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)WRITE_ONCE(pgdat->kswapd_order, reclaim_order);}finish_wait(&pgdat->kswapd_wait, &wait);prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);}/** After a short sleep, check if it was a premature sleep. If not, then* go fully to sleep until explicitly woken up.*/if (!remaining &&prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {trace_mm_vmscan_kswapd_sleep(pgdat->node_id);/** vmstat counters are not perfectly accurate and the estimated* value for counters such as NR_FREE_PAGES can deviate from the* true value by nr_online_cpus * threshold. 
To avoid the zone* watermarks being breached while under pressure, we reduce the* per-cpu vmstat threshold while kswapd is awake and restore* them before going back to sleep.*/set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);if (!kthread_should_stop())schedule();set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);} else {if (remaining)count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);elsecount_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);}finish_wait(&pgdat->kswapd_wait, &wait);
}
- 敬请期待
源码
参考kernel 6.1