linux之调度管理(5)-实时调度器

一、概述

在Linux内核中，实时进程总是比普通进程的优先级要高，实时进程的调度是由Real Time Scheduler(RT调度器)来管理，而普通进程由CFS调度器来管理。

实时进程支持的调度策略为：SCHED_FIFO和SCHED_RR。

SCHED_FIFO：没有时间片，先进先出，在被调度器选择后，可以运行任意长时间。
SCHED_RR：有时间片，其值在进程运行时会减少，就像普通进程一样。在时间片到期后，则该值重置为初始值，而进程则置于队列末尾。这确保了在有几个优先级相同的SCHED_RR进程情况下，它们总是依次执行。

使用场景：

SCHED_FIFO应用场景，比如时钟域隔离，不同宽度数据接口（如嵌入式单片机8位和16位DSP数据传送，在单片机和DSP连接达到数据匹配目的）。
SCHED_RR，需要定时切换场景或轮流实现操作场景

二、数据结构

struct rq：运行队列，每个CPU都对应一个；
struct rt_rq：实时运行队列，用于管理实时任务的调度实体；
struct sched_rt_entity：实时调度实体，用于参与调度，功能与struct sched_entity类似；
struct task_group：组调度结构体；
struct rt_bandwidth：带宽控制结构体；

先上张图，捋捋这些结构之间的关系:

从图中的结构组织关系看，与CFS调度器基本一致，区别在于CFS调度器是通过红黑树来组织调度实体，而RT调度器使用的是优先级队列来组织实时调度实体；
rt_rq运行队列，维护了100个优先级的队列（链表），优先级0-99，从高到低；
调度器管理的对象是调度实体，任务task_struct和任务组task_group都是通过内嵌调度实体的数据结构，来最终参与调度管理的；
task_group任务组调度，自身为每个CPU维护rt_rq，用于存放自己的子任务或者子任务组，子任务组又能往下级联，因此可以构造成树；

下面是关键结构体：

/*
 * Real-time scheduling entity: the object the RT scheduler actually queues.
 * It is embedded in a task (and, with group scheduling, in a task group) so
 * that either can take part in RT scheduling.
 */
struct sched_rt_entity {
	struct list_head		run_list;	/* links the entity into a priority queue */
	unsigned long			timeout;	/* configured timeout */
	unsigned long			watchdog_stamp;	/* records a jiffies value */
	unsigned int			time_slice;	/* time slice (100 ms per the accompanying text) */
	unsigned short			on_rq;
	unsigned short			on_list;

	/* Temporary link used when walking RT entities from the top down. */
	struct sched_rt_entity		*back;
#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity		*parent;	/* parent RT scheduling entity */
	/* rq on which this entity is (to be) queued: */
	struct rt_rq			*rt_rq;
	/* rq "owned" by this entity/group; holds its child tasks or child groups: */
	struct rt_rq			*my_q;
#endif
} __randomize_layout;


//kernel/kernel/sched/sched.h/* Real-Time classes' related field in a runqueue: */
struct rt_rq {struct rt_prio_array active;    //优先级队列，100个优先级的链表，并定义了位图，用于快速查询unsigned int rt_nr_running; //在RT运行队列中所有活动的任务数unsigned int rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHEDstruct {int curr; /* highest queued rt task prio */     //当前RT任务的最高优先级
#ifdef CONFIG_SMPint next; /* next highest */        //下一个要运行的RT任务的优先级，如果两个任务都有最高优先级，则curr == next
#endif} highest_prio;
#endif
#ifdef CONFIG_SMPunsigned long rt_nr_migratory;      //任务没有绑定在某个CPU上时，这个值会增减，用于任务迁移unsigned long rt_nr_total;      //用于overload检查int overloaded;     //RT运行队列过载，则将任务推送到其他CPUstruct plist_head pushable_tasks;   //优先级列表，用于推送过载任务
#endif /* CONFIG_SMP */int rt_queued;  //表示RT运行队列已经加入rq队列int rt_throttled;   //用于限流操作u64 rt_time;    //累加的运行时，超出了本地rt_runtime时，则进行限制u64 rt_runtime;     //分配给本地池的运行时/* Nests inside the rq lock: */raw_spinlock_t rt_runtime_lock;#ifdef CONFIG_RT_GROUP_SCHEDunsigned long rt_nr_boosted;    //用于优先级翻转问题解决struct rq *rq;      //指向运行队列struct task_group *tg;  //指向任务组
#endif
};

struct rt_bandwidth {/* nests inside the rq lock: */raw_spinlock_t		rt_runtime_lock;ktime_t			rt_period;      //时间周期u64			rt_runtime;     //一个时间周期内的运行时间，超过则限流，默认值为95%struct hrtimer		rt_period_timer;    //时间周期定时器unsigned int		rt_period_active;
};

/*
 * Priority array used by the RT runqueue: one list per RT priority and a
 * bitmap with one bit per list (plus one extra delimiter bit).
 */
struct rt_prio_array {
	/* include 1 bit for delimiter */
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);

	/* One run list per priority level. */
	struct list_head queue[MAX_RT_PRIO];
};

定义了位图，代表100（0-99）个优先级队列。如上面图所示，就是按照优先级把进程放入对应的优先级队列中。

三、进程的创建

内核中实时调度器的定义如下：

const struct sched_class rt_sched_class__section("__rt_sched_class") = {.enqueue_task           = enqueue_task_rt,//入队.dequeue_task           = dequeue_task_rt,//出队.yield_task             = yield_task_rt,//放弃主动权.check_preempt_curr     = check_preempt_curr_rt,.pick_next_task         = pick_next_task_rt,//调度器中选择哪个任务要被调度.put_prev_task          = put_prev_task_rt,/当一个任务将要被调度出执行.set_next_task          = set_next_task_rt,#ifdef CONFIG_SMP.balance                = balance_rt,.select_task_rq         = select_task_rq_rt,.set_cpus_allowed       = set_cpus_allowed_common,.rq_online              = rq_online_rt,.rq_offline             = rq_offline_rt,.task_woken             = task_woken_rt,.switched_from          = switched_from_rt,.find_lock_rq           = find_lock_lowest_rq,
#endif.task_tick              = task_tick_rt,.get_rr_interval        = get_rr_interval_rt,.prio_changed           = prio_changed_rt,.switched_to            = switched_to_rt,.update_curr            = update_curr_rt,#ifdef CONFIG_UCLAMP_TASK.uclamp_enabled         = 1,
#endif
};

上面rt_sched_class的定义，与fair_sched_class有一个很大的区别：没有实现task_fork回调函数。所以在创建进程时，与CFS调度器有个区别，代码如下：

void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{unsigned long flags;rseq_migrate(p);/** We're setting the CPU for the first time, we don't migrate,* so use __set_task_cpu().*/__set_task_cpu(p, smp_processor_id());if (p->sched_class->task_fork)p->sched_class->task_fork(p); //执行调度器的回调函数raw_spin_unlock_irqrestore(&p->pi_lock, flags);

实时调度器是没有task_fork 函数的。

copy_process()
sched_fork()
__sched_fork()
sched_post_fork（）
****
wake_up_new_task()
activate_task()
enqueue_task
rt_sched_class->enqueue_task-->enqueue_task_rt()

下面看一下调度实体入队：

/** Adding/removing a task to/from a priority array:*/
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{struct sched_rt_entity *rt_se = &p->rt;if (flags & ENQUEUE_WAKEUP) // 入队flagsrt_se->timeout = 0;enqueue_rt_entity(rt_se, flags);
//进程如果运行在多个CPU上，将任务添加到pushable 链表上if (!task_current(rq, p) && p->nr_cpus_allowed > 1)enqueue_pushable_task(rq, p);
}

enqueue_rt_entity：

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{struct rq *rq = rq_of_rt_se(rt_se);dequeue_rt_stack(rt_se, flags);for_each_sched_rt_entity(rt_se)__enqueue_rt_entity(rt_se, flags);enqueue_top_rt_rq(&rq->rt);
}

这里其实有一个问题：为什么实时调度实体入队时，需要先把从顶部到该rt_se的各级rt_se出队？

enqueue_task_rt和dequeue_task_rt都会调用dequeue_rt_stack接口，当请求的rt_se对应的是任务组时，会从顶部到请求的rt_se将调度实体出列；

当RT任务进行出队入队时，通过enqueue_task_rt/dequeue_task_rt两个接口来完成，调用流程如下：

enqueue_task_rt和dequeue_task_rt都会调用dequeue_rt_stack接口，当请求的rt_se对应的是任务组时，会从顶部到请求的rt_se将调度实体出列；
任务添加到rt运行队列时，如果存在多个任务可以分配给多个CPU，设置overload，用于任务的迁移；

这上面有一个RT调度器的核心函数：update_curr_rt ，运行时的统计数据更新。后面具体分析这个函数。

四、进程创建和入队之后，就是如何被调度

4.1 主调度器调度或者主动执行调度函数

__schedule()
->pick_next_task()

        ->put_prev_task_balance(选择下一个实时进程前，进行平衡处理，和SMP有关系)

        ->balance_rt

->need_pull_rt_task() ->pull_rt_task()

    ->pick_next_task_rt() （选择下一个实时进程）
->context_switch()
->switch_mm()
->cpu_*_switch_mm()
->switch_to()
->__switch_to

pick_next_task_rt函数是调度器用于选择下一个执行任务。流程如下：

与CFS调度器不同，RT调度器会在多个CPU组成的domain中，对任务进行pull/push处理，也就是说，如果当前CPU的运行队列中任务优先级都不高，那么会考虑去其他CPU运行队列中找一个更高优先级的任务来执行，以确保按照优先级处理，此外当前CPU也会把任务推送到其他更低优先级的CPU运行队列上。
_pick_next_task_rt的处理逻辑比较简单，如果实时调度实体是task，则直接查找优先级队列的位图中，找到优先级最高的任务，而如果实时调度实体是task_group，则还需要继续往下进行遍历查找；

上面的流程图，稍微和内核5.10.* 的版本有点不同，下面看一下pick_next_task函数：

** Pick up the highest-prio task:*/
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{const struct sched_class *class;struct task_struct *p;/** Optimization: we know that if all tasks are in the fair class we can* call that function directly, but only if the @prev task wasn't of a* higher scheduling class, because otherwise those loose the* opportunity to pull in more work from other CPUs.*/if (likely(prev->sched_class <= &fair_sched_class &&rq->nr_running == rq->cfs.h_nr_running)) {p = pick_next_task_fair(rq, prev, rf);if (unlikely(p == RETRY_TASK))goto restart;/* Assumes fair_sched_class->next == idle_sched_class */if (!p) {put_prev_task(rq, prev);p = pick_next_task_idle(rq);}return p;}restart:put_prev_task_balance(rq, prev, rf);for_each_class(class) {p = class->pick_next_task(rq);if (p)return p;}/* The idle class should always have a runnable task: */BUG();
}

在restart标签处，有put_prev_task_balance函数，就是执行实时调度器在SMP时的平衡处理，即balance_rt函数。所谓平衡处理就是：RT调度器会在多个CPU组成的domain中，对任务进行pull/push处理。

关于任务的pull/push，linux提供了struct plist，基于优先级的双链表，其中任务的组织关系如下图：

pushable_tasks 链表：即可以推到其他CPU 上的进程

pull_rt_task的大概示意图如下：

当前CPU上的优先级任务不高，从另一个CPU的pushable_tasks链表中找优先级更高的任务来执行；

上面的pull_rt_task 就是实现实时调度进程的SMP均衡。

4.2 系统时钟周期调度

scheduler_tick->sched_class->task_tick_rt

static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{struct sched_rt_entity *rt_se = &p->rt;update_curr_rt(rq);//更新时间update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);watchdog(rq, p);/** RR tasks need a special form of timeslice management.* FIFO tasks have no timeslices.*/if (p->policy != SCHED_RR)//先进先出的进程没有时间片，直接退出 return;//当前是RR进程，则减少时间片if (--p->rt.time_slice)return;//重置时间片100msp->rt.time_slice = sched_rr_timeslice;/** Requeue to the end of queue if we (and all of our ancestors) are not* the only element on the queue*///如果该进程不是唯一进程，则排到队尾for_each_sched_rt_entity(rt_se) {if (rt_se->run_list.prev != rt_se->run_list.next) {requeue_task_rt(rq, p, 0);resched_curr(rq);return;}}
}

update_curr_rt 函数是实时调度的核心函数。主要是更新调度信息

/** Update the current task's runtime statistics. Skip current tasks that* are not in our scheduling class.*/
static void update_curr_rt(struct rq *rq)
{struct task_struct *curr = rq->curr;struct sched_rt_entity *rt_se = &curr->rt;u64 delta_exec;u64 now;//判断是否有实时调度进程if (curr->sched_class != &rt_sched_class)return;now = rq_clock_task(rq);//执行时间delta_exec = now - curr->se.exec_start;if (unlikely((s64)delta_exec <= 0))return;schedstat_set(curr->se.statistics.exec_max,max(curr->se.statistics.exec_max, delta_exec));//更新当前进程总的执行时间curr->se.sum_exec_runtime += delta_exec;account_group_exec_runtime(curr, delta_exec);//更新执行开始时间curr->se.exec_start = now;cgroup_account_cputime(curr, delta_exec);if (!rt_bandwidth_enabled())return;for_each_sched_rt_entity(rt_se) {struct rt_rq *rt_rq = rt_rq_of_se(rt_se);int exceeded;if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {raw_spin_lock(&rt_rq->rt_runtime_lock);rt_rq->rt_time += delta_exec;exceeded = sched_rt_runtime_exceeded(rt_rq);if (exceeded)resched_curr(rq);raw_spin_unlock(&rt_rq->rt_runtime_lock);if (exceeded)do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));}}
}

4.3 运行时统计数据

运行时的统计数据更新，是在update_curr_rt函数中完成的：

update_curr_rt函数功能，主要包括两部分：
1. 运行时间的统计更新处理；
2. 如果运行时间超出了分配时间，进行时间均衡处理，并且判断是否需要进行限流，进行了限流则需要将RT队列出队，并重新进行调度；

为了更直观的理解，下边还是来两张图片说明一下：

sched_rt_avg_update更新示意如下：

rq->age_stamp：在CPU启动后运行队列首次运行时设置起始时间，后续周期性进行更新；
rt_avg：累计的RT平均运行时间，每0.5秒减半处理，用于计算CFS负载减去RT在CFS负载平衡中使用的时间百分比；

这上面有一个问题，会导致内核报RT throttling的问题，即实时进程运行超时，会向其他CPU借用时间，导致运行时间大于调度周期（默认1s）的95%。

这个问题在最后解释。

五、组调度

和CFS调度器类似，看一下RT调度器组调度的组织关系图吧：

系统为每个CPU都分配了RT运行队列，以及RT调度实体，任务组通过它包含的RT调度实体来参与调度；
任务组task_group的RT队列，用于存放归属于该组的任务或子任务组，从而形成级联的结构；

看一下实际的组织示意图：

六、带宽控制

RT调度器在带宽控制中，调度时间周期设置的为1s，运行时间设置为0.95s：

/*
 * period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

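这两个全局值可以在用户态通过/proc/sys/kernel/sched_rt_period_us和/proc/sys/kernel/sched_rt_runtime_us来进行设置；任务组各自的对应值则通过cgroup的cpu.rt_period_us和cpu.rt_runtime_us节点（例如/sys/fs/cgroup/cpu/cpu.rt_runtime_us）来设置。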

看看函数调用流程：

init_rt_bandwidth函数在创建分配RT任务组的时候调用，完成的工作是将rt_bandwidth结构体的相关字段进行初始化：设置好时间周期rt_period和运行时间限制rt_runtime，都设置成默认值；
可以从用户态通过操作/sys/fs/cgroup/cpu下对应的节点进行设置rt_period和rt_runtime，最终调用的函数是tg_set_rt_bandwidth，在该函数中会从下往上的遍历任务组进行设置时间周期和限制的运行时间；
在enqueue_rt_entity将RT调度实体入列时，最终触发start_rt_bandwidth函数执行，当高精度定时器到期时调用do_sched_rt_period_timer函数；
do_sched_rt_period_timer函数，会去判断该RT运行队列的累计运行时间rt_time与设置的限制运行时间rt_runtime之间的大小关系，以确定是否限流的操作。在这个函数中，如果已经进行了限流操作，会调用balance_runtime来在多个CPU之间进行时间均衡处理，简单点说，就是从其他CPU的rt_rq队列中匀出一部分时间增加到当前CPU的rt_rq队列中，也就是将当前rt_rq运行队列的限制运行时间rt_runtime增加一部分，其他CPU的rt_rq运行队列限制运行时间减少一部分。

来一张效果示意图：

上面这种通过定时器处理函数来实现带宽控制的方式，和CFS调度器是一样的。

七、RT-throttling 出现的问题分析

7.1 出现问题的背景：

Linux 上调度策略为SCHED_FIFO的实时进程是根据优先级抢占运行的。当没有更高优先级的实时进程抢占，而此进程又由于bug等原因长时间运行，不调度其它进程，系统就会出现无响应。这里要分析的RT throttling就是针对此种情况的，它通过限制每个单位时间内分配给实时进程的CPU运行时间，来防止上述情况的出现。

在这里补充一点，内核实时调度还有RR策略，RR调度的时间片是100ms,假设此时有两个RR进程，调度器就是以100ms为周期单位，进行来回调度，也有可能出现RT-throttling的内核日志。

此文章中，谈论简单的FIFO的实时进程。

7.2 系统中的内核配置参数

标准的设置是1s的时间内，实时进程的运行时间是950ms，其余的50ms时间给normal进程使用。

sched_rt_period_us值为1000000us=1s，表示单位时间为1s

sched_rt_runtime_us值为950000us=0.95s，表示实时进程的运行时间为0.95s。

这两个接口的实现代码如下：

//kernel/kernel/sysctl.cstatic struct ctl_table kern_table[] = {....{.procname       = "sched_rt_period_us",.data           = &sysctl_sched_rt_period,.maxlen         = sizeof(unsigned int),.mode           = 0644,.proc_handler   = sched_rt_handler,},{.procname       = "sched_rt_runtime_us",.data           = &sysctl_sched_rt_runtime,.maxlen         = sizeof(int),.mode           = 0644,.proc_handler   = sched_rt_handler,},
....}

sched_rt_period_us接口设置的是sysctl_sched_rt_period变量，sched_rt_runtime_us接口设置的是sysctl_sched_rt_runtime变量。读写的实现都是通过sched_rt_handler函数，这里就不具体分析了。

/*
 * sysctl handler shared by sched_rt_period_us and sched_rt_runtime_us.
 *
 * Reads/writes the value through proc_dointvec(). On a successful write the
 * new settings are validated and applied; if any check fails both sysctl
 * variables are restored to the values they had on entry. Returns the result
 * of proc_dointvec() or of the first failing check.
 */
int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	int old_period, old_runtime;
	static DEFINE_MUTEX(mutex);
	int ret;

	mutex_lock(&mutex);
	/* Remember the current values so a rejected write can be rolled back. */
	old_period = sysctl_sched_rt_period;
	old_runtime = sysctl_sched_rt_runtime;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (!ret && write) {
		ret = sched_rt_global_validate();
		if (ret)
			goto undo;

		ret = sched_dl_global_validate();
		if (ret)
			goto undo;

		ret = sched_rt_global_constraints();
		if (ret)
			goto undo;

		sched_rt_do_global();
		sched_dl_do_global();	/* the deadline scheduler is affected as well */
	}
	/* The rollback is only reachable through the undo label. */
	if (0) {
undo:
		sysctl_sched_rt_period = old_period;
		sysctl_sched_rt_runtime = old_runtime;
	}
	mutex_unlock(&mutex);

	return ret;
}

7.3 设置RT throttling，那么它是如何工作的？

一个实时调度器的核心函数update_curr_rt：

static void update_curr_rt(struct rq *rq)
{struct task_struct *curr = rq->curr;struct sched_rt_entity *rt_se = &curr->rt;u64 delta_exec;u64 now;if (curr->sched_class != &rt_sched_class)/*判断当前进程调度类*/return;//*运行队列现在的时间与当前进程开始运行时间之差* */now = rq_clock_task(rq);delta_exec = now - curr->se.exec_start;/*更新进程的真实运行时间*/    if (unlikely((s64)delta_exec <= 0))return;schedstat_set(curr->se.statistics.exec_max,max(curr->se.statistics.exec_max, delta_exec));curr->se.sum_exec_runtime += delta_exec;account_group_exec_runtime(curr, delta_exec);curr->se.exec_start = now;cgroup_account_cputime(curr, delta_exec);if (!rt_bandwidth_enabled()) /*判断RT throttling是否开启*/return;for_each_sched_rt_entity(rt_se) {/*/*遍历此实时进程的调度单元*/*/struct rt_rq *rt_rq = rt_rq_of_se(rt_se);int exceeded;if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {raw_spin_lock(&rt_rq->rt_runtime_lock);rt_rq->rt_time += delta_exec; // /*rt_rq的运行时间是否超过了分配给它的时间片*/     exceeded = sched_rt_runtime_exceeded(rt_rq);if (exceeded)resched_curr(rq);raw_spin_unlock(&rt_rq->rt_runtime_lock);if (exceeded)//如果超过，开始限制do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));}}
}

update_curr_rt函数用来更新当前实时进程的运行时间统计值，如果当前进程不是实时进程，即调度类不为rt_sched_class，则直接返回。
delta_exec值为此运行队列的当前时间与此进程开始运行时间之差，也即是此进程此次调度运行的时长。然后更新进程的真实运行时间和开始运行时间。
rt_bandwidth_enabled函数判断sysctl_sched_rt_runtime变量值是否大于等于0，如果此变量值设置为-1（此时运行时间被视为RUNTIME_INF，即不受限制），就关掉了RT throttling功能，这里就会直接返回。
然后遍历此实时进程的调度实体，找到相应的就绪队列，更新运行时间后，通过sched_rt_runtime_exceeded函数判断是否此实时进程是否超过了分配给它的时间片。

接着看函数：sched_rt_runtime_exceeded()

//kernel/kernel/sched/rt.cstatic int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{u64 runtime = sched_rt_runtime(rt_rq);/*获取当前队列的最大运行时间*/  if (rt_rq->rt_throttled)/*当前队列的实时调度受到限制*/     return rt_rq_throttled(rt_rq);/*当前队列的最大运行时间大于当前队列的调度周期时间*/if (runtime >= sched_rt_period(rt_rq))return 0;balance_runtime(rt_rq);runtime = sched_rt_runtime(rt_rq); /*重新获取当前队列的最大运行时间*/if (runtime == RUNTIME_INF)/*关闭了RT throttling*/return 0;if (rt_rq->rt_time > runtime) {/*累计运行时间大于最大运行时间*/             struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);/** Don't actually throttle groups that have no runtime assigned* but accrue some time due to boosting.*/if (likely(rt_b->rt_runtime)) {rt_rq->rt_throttled = 1;printk_deferred_once("sched: RT throttling activated\n");} else {/** In case we did anyway, make it go away,* replenishment is a joke, since it will replenish us* with exactly 0 ns.*/rt_rq->rt_time = 0;}if (rt_rq_throttled(rt_rq)) {/*检查队列的实时调度是否受到限制*/sched_rt_rq_dequeue(rt_rq); /*将调度实体从实时运行队列中删除*/return 1;}}return 0;
}

runtime值为当前队列的最大运行时间rt_runtime。rt_throttled字段表示当前队列的实时调度是否受到限制，如果受到限制了，就直接返回rt_rq_throttled(rt_rq)的结果（通常为1），在update_curr_rt函数中就会调用resched_curr函数触发重新调度，让出cpu。
如果当前队列的最大运行时间大于等于当前队列的调度周期时间，则返回0，这样此运行队列上的任务还能够继续运行。
balance_runtime函数在RT_RUNTIME_SHARE特性使能的情况下，如果当前队列的运行时间超过了最大运行时间，则可以从其他cpu上借用时间。具体代码这里先不分析，后面分析。
重新获取当前队列的最大运行时间runtime，如果值等于RUNTIME_INF说明关闭了RT throttling，则直接返回0。
如果累计运行时间大于最大运行时间，就会执行上面的代码片段。rt_b为运行队列rt_rq的进程组带宽控制结构体指针，如果rt_runtime即此进程组的任务运行时间额度值有效，则设置rt_throttled为1，表明此队列的实时调度受到限制，并打印出“sched: RT throttling activated”信息。接着检查队列的实时调度如果受到限制，则返回1，在update_curr_rt函数中让出cpu。

下面讲一下从其他CPU上借用时间：

sched_rt_runtime_exceeded() ->balance_runtime();balance_runtime在当前队列运行时间超过最大运行时间后，可以从其他cpu上借用时间.

/*
 * Try to obtain more runtime for @rt_rq once it has used more time than it
 * was assigned. Only active when the RT_RUNTIME_SHARE scheduler feature is
 * set. rt_runtime_lock is dropped around the call to do_balance_runtime()
 * and re-taken afterwards.
 */
static void balance_runtime(struct rt_rq *rt_rq)
{
	/* RT_RUNTIME_SHARE allows rt_runtime to be shared between CPUs. */
	if (!sched_feat(RT_RUNTIME_SHARE))
		return;

	/* Still within budget: nothing to borrow. */
	if (rt_rq->rt_time <= rt_rq->rt_runtime)
		return;

	raw_spin_unlock(&rt_rq->rt_runtime_lock);
	do_balance_runtime(rt_rq);
	raw_spin_lock(&rt_rq->rt_runtime_lock);
}

RT_RUNTIME_SHARE通过SCHED_FEAT(RT_RUNTIME_SHARE, ...)定义，位于 kernel/sched/features.h文件中。在较早的内核中默认是使能的（true）；需要注意，从v5.10开始上游内核已将其默认值改为false，具体以所用内核源码为准。

它表示支持多个cpu间的rt_runtime共享。如果不支持的话，就直接返回。

如果当前队列的累计运行时间大于最大运行时间，则调用do_balance_runtime函数。

do_balance_runtime：

//kernel/kernel/sched/rt.cstatic void do_balance_runtime(struct rt_rq *rt_rq)
{struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;int i, weight;u64 rt_period;weight = cpumask_weight(rd->span);raw_spin_lock(&rt_b->rt_runtime_lock);rt_period = ktime_to_ns(rt_b->rt_period);/*任务组一个控制周期的时间*/     for_each_cpu(i, rd->span) {/*找到在另一个cpu上运行的同一任务组的运行队列*/struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);s64 diff;if (iter == rt_rq)/*同一运行队列则跳过*/continue;raw_spin_lock(&iter->rt_runtime_lock);/** Either all rqs have inf runtime and there's nothing to steal* or __disable_runtime() below sets a specific rq to inf to* indicate its been disabled and disalow stealing.*/if (iter->rt_runtime == RUNTIME_INF)goto next;/*RT throttling关闭，不允许借用时间*//** From runqueues with spare time, take 1/n part of their* spare time, but no more than our period.*//*最大能够借用时间*/diff = iter->rt_runtime - iter->rt_time;if (diff > 0) {diff = div_u64((u64)diff, weight);if (rt_rq->rt_runtime + diff > rt_period)diff = rt_period - rt_rq->rt_runtime; /*修正后可借用*/iter->rt_runtime -= diff;rt_rq->rt_runtime += diff;/*满足条件退出，否则继续从其他cpu借用*/if (rt_rq->rt_runtime == rt_period) {raw_spin_unlock(&iter->rt_runtime_lock);break;}}
next:raw_spin_unlock(&iter->rt_runtime_lock);}raw_spin_unlock(&rt_b->rt_runtime_lock);
}

rd->span表示此调度域的rq可运行的cpu的一个mask，这里会遍历此mask上的cpu，如果对应的cpu的rq和当前的rq是同一运行队列，则直接跳过；如果对应的cpu的rq已关闭RT throttling功能，则不允许借用时间。内核中关于这块代码的注释是：

Either all rqs have inf runtime and there's nothing to steal or __disable_runtime() below sets a specific rq to inf to indicate its been disabled and disalow stealing.

大概意思是如果所有的运行队列都设置为RUNTIME_INF即关闭了RT throttling功能，则没有时间可以借用。或者某个指定的运行队列调用__disable_runtime()函数，则不允许别的借用自己的时间。

diff是iter运行队列最大能够借用的时间，后面经过修正后，将diff加入到rt_rq的最大可运行时间上。如果新的最大可运行时间等于此任务组的控制周期的时间，则不需要接着再从其他的CPU上借用时间，就直接break退出。

实时进程所在的cpu占用超时，可以向其他的CPU借用，将其他CPU的时间借用过来，这样此实时进程所在的CPU占有率达到100%，这样做的目的是为了避免实时进程由于缺少CPU时间而向其他的CPU迁移，减少不必要的迁移成本。此cpu上未绑定核的普通进程可以迁移到其他cpu上，这样就会得到调度。但是如果此CPU上有进程只绑定在此CPU上，那么就只有在这里饿死了。

所以，此时占用率达到100%，内核日志会出现：sched: RT throttling activated。因为是先借用时间，再判断是否打印内核日志。

7.4 总结

rt_rq实时进程运行队列里面提到rt_time和rt_runtime，一个是运行累计时间，一个是最大运行时间，当运行累计时间超过最大运行时间的时候，rt_throttled则被设置为1。

补充一个隐藏时间算法：

其实还有一个隐藏的时间概念，即sched_rt_period_us（1s），意味着sched_rt_period_us时间内，实时进程可以占用CPU rt_runtime时间，如果实时进程每个时间周期内都没有调度，则在do_sched_rt_period_timer定时器函数中将rt_time减去一个周期，然后比较rt_runtime，恢复rt_throttled。

init_rt_bandwidth()->rt_b->rt_period_timer.function = sched_rt_period_timer;

sched_rt_period_timer()->do_sched_rt_period_timer()

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{int i, idle = 1, throttled = 0;const struct cpumask *span;....
raw_spin_lock(&rt_rq->rt_runtime_lock);if (rt_rq->rt_throttled)balance_runtime(rt_rq);runtime = rt_rq->rt_runtime;//overrun来自对周期时间定时器误差的校正rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {rt_rq->rt_throttled = 0;enqueue = 1;
...
}