嵌入式Linux性能调优核心战场,从代码层面彻底吃透DVFS机制
作为嵌入式Linux开发者,你是否只停留在echo performance > scaling_governor的表层操作?当遇到频率切换延迟高、电压调节失败、多核异构频率协同等问题时,是否束手无策?
今天这篇文章,将逐行剖析RK3576平台的cpufreq实现,从源码层面拆解Linux内核DVFS(动态电压频率调节)的每一个细节,带你从“会用”到“精通”,解决实际开发中的核心痛点。
一、为什么要深入cpufreq源码?
嵌入式Linux开发中,cpufreq是性能与功耗平衡的核心,但这些问题往往让开发者卡壳:
•频率切换延迟过高,如何优化?
•电压调节失败,如何定位?
•多核异构场景下,频率如何协同?
•自定义调速器,如何接入框架?
只有深入源码,才能真正掌握DVFS的底层逻辑,而非停留在“调参数”的表面操作。
二、整体架构:代码视角的分层设计
2.1文件组织与模块依赖
cpufreq驱动的代码分布清晰,核心文件各司其职:
drivers/cpufreq/
├── cpufreq.c              # 核心框架:策略管理、通知链、sysfs
├── cpufreq-dt.c           # DT 通用驱动:OPP 解析、时钟操作
├── cpufreq-dt.h           # DT 驱动接口定义
├── rockchip-cpufreq.c     # RK 平台驱动:SoC 特性、DVFS 锁、监控
├── rockchip-cpufreq.h     # RK 驱动导出接口
├── cpufreq_governor.c     # 调速器公共框架
├── cpufreq_governor.h     # 调速器数据结构
├── cpufreq_interactive.c  # Interactive 调速器实现
├── cpufreq_ondemand.c     # Ondemand 调速器实现
├── freq_table.c           # 频率表辅助函数
└── cpufreq_stats.c        # 统计信息模块
2.2核心结构体关系(代码定义)
cpufreq的核心逻辑围绕两个结构体展开,是理解整个框架的关键:
// include/linux/cpufreq.hstructcpufreq_policy{cpumask_var_t cpus; // 本策略管理的 CPU 掩码cpumask_var_t related_cpus; // 硬件相关的 CPU(共享时钟)unsignedint min; // 用户设置的最小频率unsignedint max; // 用户设置的最大频率unsignedint cur; // 当前频率(kHz)structcpufreq_governor*governor; // 当前调速器void *governor_data;// 调速器私有数据structcpufreq_frequency_table*freq_table;// 频率表structcpufreq_stats *stats; // 统计信息structkobject kobj; // sysfs 对象void *driver_data; // 驱动私有数据// ... 更多字段};structcpufreq_driver{char name[CPUFREQ_NAME_LEN];unsignedint flags;/* 必须实现的回调 */int(*target_index)(structcpufreq_policy *policy,unsignedintindex);unsignedint(*get)(unsignedintcpu);/* 可选回调 */int(*init)(structcpufreq_policy *policy);int(*exit)(structcpufreq_policy *policy);int(*online)(structcpufreq_policy *policy);int(*offline)(structcpufreq_policy *policy);// ...};
脑图:RK3576 CPUFreq核心架构
三、初始化流程:从模块加载到频率就绪
RK3576的cpufreq初始化是“先集群初始化,再注册驱动”的逻辑,核心分为三步:
3.1驱动入口:rockchip_cpufreq_driver_init
这是整个RK平台cpufreq的入口函数,核心逻辑是遍历CPU集群、初始化集群信息、注册通知链和平台设备:
// drivers/cpufreq/rockchip-cpufreq.cstaticint__initrockchip_cpufreq_driver_init(void){structcluster_info *cluster, *pos;structcpufreq_dt_platform_data pdata = {0};intcpu, ret;boolis_opp_shared_cpu_bus =false;/* 遍历所有可能的 CPU,为每个 cluster 初始化 */for_each_possible_cpu(cpu) {cluster = rockchip_cluster_info_lookup(cpu);if(cluster)continue; // 已初始化过cluster = kzalloc(sizeof(*cluster), GFP_KERNEL);if(!cluster) {ret = -ENOMEM;gotorelease_cluster_info;}/* 核心初始化:解析 OPP、获取 regulator、设置 SoC 信息 */ret = rockchip_cpufreq_cluster_init(cpu, cluster);if(ret) {pr_err("Failed to initialize dvfs info cpu%dn", cpu);gotorelease_cluster_info;}list_add(&cluster->list_head, &cluster_info_list);if(cluster->is_opp_shared_cpu_bus)is_opp_shared_cpu_bus =true;}/* 设置平台数据 */pdata.have_governor_per_policy =true;pdata.suspend = rockchip_cpufreq_suspend;/* 注册 cpufreq 通知链 */ret = cpufreq_register_notifier(&rockchip_cpufreq_notifier_block,CPUFREQ_POLICY_NOTIFIER);if(ret) {pr_err("failed to register cpufreq notifiern");gotorelease_cluster_info;}/* 如果存在 OPP 共享总线,注册 transition 通知器 */if(is_opp_shared_cpu_bus) {ret = cpufreq_register_notifier(&rockchip_cpufreq_transition_notifier_block,CPUFREQ_TRANSITION_NOTIFIER);// ...}/* 注册 panic 通知器,用于调试 */ret = atomic_notifier_chain_register(&panic_notifier_list,&rockchip_cpufreq_panic_notifier_block);/* 注册 platform_device,触发 cpufreq-dt 驱动 probe */returnPTR_ERR_OR_ZERO(platform_device_register_data(NULL,"cpufreq-dt",-1, (void*)&pdata,sizeof(structcpufreq_dt_platform_data)));}module_init(rockchip_cpufreq_driver_init);// drivers/cpufreq/rockchip-cpufreq.cstaticint__initrockchip_cpufreq_driver_init(void){structcluster_info *cluster, *pos;structcpufreq_dt_platform_data pdata = {0};intcpu, ret;boolis_opp_shared_cpu_bus =false;/* 遍历所有可能的 CPU,为每个 cluster 初始化 */for_each_possible_cpu(cpu) {cluster = rockchip_cluster_info_lookup(cpu);if(cluster)continue; // 已初始化过cluster = kzalloc(sizeof(*cluster), GFP_KERNEL);if(!cluster) {ret = -ENOMEM;gotorelease_cluster_info;}/* 核心初始化:解析 OPP、获取 
regulator、设置 SoC 信息 */ret = rockchip_cpufreq_cluster_init(cpu, cluster);if(ret) {pr_err("Failed to initialize dvfs info cpu%dn", cpu);gotorelease_cluster_info;}list_add(&cluster->list_head, &cluster_info_list);if(cluster->is_opp_shared_cpu_bus)is_opp_shared_cpu_bus =true;}/* 设置平台数据 */pdata.have_governor_per_policy =true;pdata.suspend = rockchip_cpufreq_suspend;/* 注册 cpufreq 通知链 */ret = cpufreq_register_notifier(&rockchip_cpufreq_notifier_block,CPUFREQ_POLICY_NOTIFIER);if(ret) {pr_err("failed to register cpufreq notifiern");gotorelease_cluster_info;}/* 如果存在 OPP 共享总线,注册 transition 通知器 */if(is_opp_shared_cpu_bus) {ret = cpufreq_register_notifier(&rockchip_cpufreq_transition_notifier_block,CPUFREQ_TRANSITION_NOTIFIER);// ...}/* 注册 panic 通知器,用于调试 */ret = atomic_notifier_chain_register(&panic_notifier_list,&rockchip_cpufreq_panic_notifier_block);/* 注册 platform_device,触发 cpufreq-dt 驱动 probe */returnPTR_ERR_OR_ZERO(platform_device_register_data(NULL,"cpufreq-dt",-1, (void*)&pdata,sizeof(structcpufreq_dt_platform_data)));}module_init(rockchip_cpufreq_driver_init);
3.2 Cluster初始化:rockchip_cpufreq_cluster_init
每个CPU集群(大核/小核)的初始化核心是解析OPP表、获取电压调节器、读取SoC特有信息:
Cstaticintrockchip_cpufreq_cluster_init(intcpu,structcluster_info *cluster){ structrockchip_opp_info*opp_info = &cluster->opp_info; structdevice_node*np; structdevice*dev; char*reg_name; intret =0; u32 freq =0; dev =get_cpu_device(cpu); if(!dev) return-ENODEV; /* 从 CPU 节点获取 operating-points-v2 phandle */ np =of_parse_phandle(dev->of_node,"operating-points-v2",0); if(!np) { dev_warn(dev,"OPP-v2 not supportedn"); return-ENOENT; } /* 获取共享 OPP 的 CPU 掩码(大核/小核 cluster) */ ret =dev_pm_opp_of_get_sharing_cpus(dev, &cluster->cpus); if(ret) { dev_err(dev,"Failed to get sharing cpusn"); of_node_put(np); returnret; } /* 检查是否共享 DSU/CCI 总线 OPP */ if(of_property_read_bool(np,"rockchip,opp-shared-dsu") || of_property_read_bool(np,"rockchip,opp-shared-cci")) cluster->is_opp_shared_cpu_bus =true; /* 读取 CPU 到总线频率的百分比 */ of_property_read_u32(np,"rockchip,cpu-freq-percent", &cluster->cpu_freq_percent); /* 读取空闲阈值频率 */ if(!of_property_read_u32(np,"rockchip,idle-threshold-freq", &freq)) cluster->idle_threshold_freq = freq; of_node_put(np); /* 确定 regulator 名称 */ if(of_find_property(dev->of_node,"cpu-supply",NULL)) reg_name ="cpu"; elseif(of_find_property(dev->of_node,"cpu0-supply",NULL)) reg_name ="cpu0"; else return-ENOENT; /* 获取 SoC 特定的 OPP 数据处理函数 */ rockchip_get_opp_data(rockchip_cpufreq_of_match, opp_info); /* 初始化 OPP 信息:读取 nvmem、设置 supported_hw */ ret =rockchip_init_opp_info(dev, opp_info,NULL, reg_name); if(ret) dev_err(dev,"failed to init opp infon"); returnret;}
3.3 RK3576特有的SoC信息读取
RK3576不同版本(M/J/S)的频率/电压支持不同,核心是从nvmem读取芯片规格:
staticintrk3576_cpu_get_soc_info(structdevice *dev,structdevice_node *np,int*bin,int*process){intret =0;u8 spec =0, test_version =0;if(!bin)return0;/* 从 nvmem 读取芯片规格序列号 */if(of_property_match_string(np,"nvmem-cell-names","specification_serial_number") >=0) {ret = rockchip_nvmem_cell_read_u8(np,"specification_serial_number",&spec);if(ret) {dev_err(dev,"Failed to get specification_serial_numbern");returnret;}}/* 读取测试版本 */if(of_property_match_string(np,"nvmem-cell-names","test_version") >=0) {ret = rockchip_nvmem_cell_read_u8(np,"test_version", &test_version);if(ret) {dev_err(dev,"Failed to get test_versionn");returnret;}}/* 根据 spec 值判断芯片型号 */if(spec ==0xd) {*bin =1; /* RK3576M */}elseif(spec ==0xa) {*bin =2; /* RK3576J */}elseif(spec ==0x13) {if(test_version ==0) {*bin =3; /* RK3576S */}else{*bin =0;dev_info(dev,"bin=%d (3)n", *bin);return0;}}if(*bin < 0)*bin =0;dev_info(dev,"bin=%dn", *bin);returnret;}
流程图:RK3576 CPUFreq初始化流程
st=>start: 模块加载(rockchip_cpufreq_driver_init)
op1=>operation: 遍历所有CPU
op2=>operation: 查找/创建cluster_info
op3=>operation: rockchip_cpufreq_cluster_init
op4=>operation: 解析OPP-v2节点
op5=>operation: 获取共享CPU掩码/总线信息
op6=>operation: 读取SoC信息(nvmem)
op7=>operation: 注册策略/转换通知链
op8=>operation: 注册cpufreq-dt平台设备
e=>end: 触发cpufreq-dt probe,初始化完成
st->op1->op2->op3->op4->op5->op6->op7->op8->e
四、频率调整核心:从Governor到硬件
频率调整是cpufreq的核心流程,完整链路是:调速器决策→核心层处理→驱动层执行→ OPP子系统→硬件(时钟/电压)
4.1调速器决策:interactive为例
interactive是嵌入式场景最常用的调速器,核心是“按需快速升频,缓慢降频”,关键逻辑在负载计算和频率评估:
// drivers/cpufreq/cpufreq_interactive.c/* 核心数据结构 */structinteractive_cpu{structupdate_util_dataupdate_util; // 注册到调度器的钩子structinteractive_policy*ipolicy;structirq_workirq_work; // 中断上下文工作u64 last_sample_time;boolwork_in_progress;/* 负载计算相关 */spinlock_tload_lock;u64 time_in_idle;u64 time_in_idle_timestamp;u64 cputime_speedadj; // 加权 CPU 时间/* 频率控制 */spinlock_ttarget_freq_lock;unsignedinttarget_freq;unsignedintfloor_freq; // 最低允许频率u64 pol_floor_val_time; // 策略级 floor 时间u64 loc_floor_val_time; // CPU 级 floor 时间// ...};/* 调度器回调:每次 CPU 状态更新时调用 */staticvoiddbs_update_util_handler(structupdate_util_data *data, u64 time,unsignedintflags){structinteractive_cpu*icpu =container_of(data,structinteractive_cpu, update_util);structinteractive_policy*ipolicy = icpu->ipolicy;structinteractive_tunables*tunables = ipolicy->tunables;u64 delta_ns, lst;/* 检查是否可以更新 */if(!cpufreq_this_cpu_can_update(ipolicy->policy))return;/* 避免重复工作 */if(icpu->work_in_progress)return;/* 检查采样间隔 */lst =READ_ONCE(icpu->last_sample_time);delta_ns = time - lst;if((s64)delta_ns < tunables->sampling_rate * NSEC_PER_USEC)return;/* 提交 irq_work,在中断上下文执行 */icpu->last_sample_time = time;icpu->work_in_progress =true;irq_work_queue(&icpu->irq_work);}/* 实际频率评估函数 */staticvoideval_target_freq(structinteractive_cpu *icpu){structinteractive_tunables*tunables = icpu->ipolicy->tunables;structcpufreq_policy*policy = icpu->ipolicy->policy;u64 cputime_speedadj, now, max_fvtime;unsignedintnew_freq, loadadjfreq, delta_time;unsignedlongflags;intcpu_load;/* 计算 CPU 负载 */spin_lock_irqsave(&icpu->load_lock, flags);now =update_load(icpu,smp_processor_id());delta_time = (unsignedint)(now - icpu->cputime_speedadj_timestamp);cputime_speedadj = icpu->cputime_speedadj;spin_unlock_irqrestore(&icpu->load_lock, flags);if(!delta_time)return;/* 计算负载百分比 */cpu_load = (unsignedint)(100* cputime_speedadj / delta_time) / policy->cur;spin_lock_irqsave(&icpu->target_freq_lock, flags);/* 根据负载选择目标频率 */loadadjfreq = cpu_load * policy->cur;if(cpu_load >= 
tunables->go_hispeed_load) {/* 高负载:进入 hispeed_freq */if(policy->cur < tunables->hispeed_freq) {new_freq = tunables->hispeed_freq;}else{new_freq =choose_freq(icpu, loadadjfreq);/* 检查 above_hispeed_delay */if(now - max_fvtime < freq_to_above_hispeed_delay(tunables, new_freq))new_freq =max(new_freq, tunables->hispeed_freq);}}else{/* 低负载:按比例降频 */new_freq =choose_freq(icpu, loadadjfreq);}/* 应用 floor 约束 */if(new_freq < icpu->floor_freq) {if(now - icpu->pol_floor_val_time < tunables->min_sample_time)new_freq = icpu->floor_freq;}/* 限制在策略范围内 */new_freq =max(new_freq, policy->min);new_freq =min(new_freq, policy->max);/* 提交频率变更 */if(new_freq != policy->cur) {icpu->target_freq = new_freq;spin_lock(&speedchange_cpumask_lock);cpumask_set_cpu(smp_processor_id(), &speedchange_cpumask);spin_unlock(&speedchange_cpumask_lock);wake_up_process(speedchange_task); // 唤醒内核线程执行变更}spin_unlock_irqrestore(&icpu->target_freq_lock, flags);}
4.2频率选择算法:choose_freq
interactive调速器的核心算法,用二分查找思想找到满足负载的最低频率:
/** 选择满足目标负载的最低频率* 采用二分查找思想,在频率表中寻找最优解*/staticunsignedintchoose_freq(structinteractive_cpu *icpu,unsignedintloadadjfreq){structcpufreq_policy*policy = icpu->ipolicy->policy;structcpufreq_frequency_table*freq_table = policy->freq_table;unsignedintprevfreq, freqmin =0, freqmax = UINT_MAX, tl;unsignedintfreq = policy->cur;intindex;do{prevfreq = freq;/* 获取当前频率的目标负载 */tl =freq_to_targetload(icpu->ipolicy->tunables, freq);/** 查找满足 loadadjfreq / tl <= freq 的最低频率* 即:freq >= loadadjfreq / tl*/index =cpufreq_frequency_table_target(policy, loadadjfreq / tl,CPUFREQ_RELATION_L);freq = freq_table[index].frequency;if(freq > prevfreq) {/* 频率上升:记录最小值 */freqmin = prevfreq;if(freq >= freqmax) {/* 超过上限,回退 */index =cpufreq_frequency_table_target(policy, freqmax -1,CPUFREQ_RELATION_H);freq = freq_table[index].frequency;if(freq == freqmin)break;}}elseif(freq < prevfreq) {/* 频率下降:记录最大值 */freqmax = prevfreq;if(freq <= freqmin) {/* 低于下限,回退 */index =cpufreq_frequency_table_target(policy, freqmin +1,CPUFREQ_RELATION_L);freq = freq_table[index].frequency;if(freq == freqmax)break;}}}while(freq != prevfreq);returnfreq;}
4.3核心层频率切换:cpufreq_core
核心层负责频率切换的同步、通知和状态管理,保证线程安全:
// drivers/cpufreq/cpufreq.c/* 开始频率转换 */voidcpufreq_freq_transition_begin(structcpufreq_policy *policy,structcpufreq_freqs *freqs){/* 防止重复调用导致死锁 */WARN_ON(!(cpufreq_driver->flags & CPUFREQ_ASYNC_NOTIFICATION)&& current == policy->transition_task);wait:/* 等待前一次转换完成 */wait_event(policy->transition_wait, !policy->transition_ongoing);spin_lock(&policy->transition_lock);if(unlikely(policy->transition_ongoing)) {spin_unlock(&policy->transition_lock);gotowait;}policy->transition_ongoing =true;policy->transition_task = current;spin_unlock(&policy->transition_lock);/* 发送 PRECHANGE 通知 */cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE);}/* 频率转换通知 */staticvoidcpufreq_notify_transition(structcpufreq_policy *policy,structcpufreq_freqs *freqs,unsignedintstate){intcpu;freqs->policy = policy;freqs->flags = cpufreq_driver->flags;switch(state) {caseCPUFREQ_PRECHANGE:/* 同步 old frequency */if(policy->cur && policy->cur != freqs->old) {freqs->old = policy->cur;}/* 调用 transition notifier 链 */srcu_notifier_call_chain(&cpufreq_transition_notifier_list,CPUFREQ_PRECHANGE, freqs);/* 调整 loops_per_jiffy */adjust_jiffies(CPUFREQ_PRECHANGE, freqs);break;caseCPUFREQ_POSTCHANGE:adjust_jiffies(CPUFREQ_POSTCHANGE, freqs);/* 记录 tracepoint */for_each_cpu(cpu, policy->cpus)trace_cpu_frequency(freqs->new, cpu);/* 调用 transition notifier 链 */srcu_notifier_call_chain(&cpufreq_transition_notifier_list,CPUFREQ_POSTCHANGE, freqs);/* 更新统计 */cpufreq_stats_record_transition(policy, freqs->new);policy->cur = freqs->new;}}/* 结束频率转换 */voidcpufreq_freq_transition_end(structcpufreq_policy *policy,structcpufreq_freqs *freqs,inttransition_failed){cpufreq_notify_post_transition(policy, freqs, transition_failed);/* 更新频率缩放比例(用于调度器) */arch_set_freq_scale(policy->related_cpus,policy->cur,policy->cpuinfo.max_freq);spin_lock(&policy->transition_lock);policy->transition_ongoing =false;policy->transition_task =NULL;spin_unlock(&policy->transition_lock);wake_up(&policy->transition_wait);}
4.4驱动层实现:cpufreq-dt
DT驱动是通用层,对接核心层和平台层,核心是频率/电压的实际设置:
// drivers/cpufreq/cpufreq-dt.c/* 设置目标频率 */staticintset_target(structcpufreq_policy *policy,unsignedintindex){structprivate_data*priv = policy->driver_data;unsignedlongfreq = policy->freq_table[index].frequency;/* RK 平台使用特殊的 OPP 设置函数 */returnrockchip_cpufreq_opp_set_rate(priv->cpu_dev, freq *1000);returndev_pm_opp_set_rate(priv->cpu_dev, freq *1000);}/* 驱动初始化 */staticintcpufreq_init(structcpufreq_policy *policy){structprivate_data*priv;structdevice*cpu_dev;structclk*cpu_clk;unsignedinttransition_latency;intret;priv =cpufreq_dt_find_data(policy->cpu);if(!priv) {pr_err("failed to find data for cpu%dn", policy->cpu);return-ENODEV;}cpu_dev = priv->cpu_dev;/* 获取 CPU 时钟 */cpu_clk =clk_get(cpu_dev,NULL);if(IS_ERR(cpu_clk)) {ret =PTR_ERR(cpu_clk);dev_err(cpu_dev,"%s: failed to get clk: %dn", __func__, ret);returnret;}/* 获取最大转换延迟 */transition_latency =dev_pm_opp_get_max_transition_latency(cpu_dev);if(!transition_latency)transition_latency = CPUFREQ_ETERNAL;/* 填充 policy */cpumask_copy(policy->cpus, priv->cpus);policy->driver_data = priv;policy->clk = cpu_clk;policy->freq_table = priv->freq_table;policy->suspend_freq =dev_pm_opp_get_suspend_opp_freq(cpu_dev) /1000;policy->cpuinfo.transition_latency = transition_latency;policy->dvfs_possible_from_any_cpu =true;/* 支持 boost 模式 */if(policy_has_boost_freq(policy)) {ret =cpufreq_enable_boost_support();if(ret)gotoout_clk_put;cpufreq_dt_attr[1] = &cpufreq_freq_attr_scaling_boost_freqs;}return0;out_clk_put:clk_put(cpu_clk);returnret;}staticstructcpufreq_driverdt_cpufreq_driver = {.flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK |CPUFREQ_IS_COOLING_DEV,.verify = cpufreq_generic_frequency_table_verify,.target_index = set_target,.get = cpufreq_generic_get,.init = cpufreq_init,.exit = cpufreq_exit,.online = cpufreq_online,.offline = cpufreq_offline,.register_em = cpufreq_register_em_with_opp,.name ="cpufreq-dt",.attr = cpufreq_dt_attr,.suspend = cpufreq_generic_suspend,};
4.5 RK平台特殊处理:rockchip_cpufreq
RK3576的定制化逻辑,包括DVFS锁、Read Margin、多路电压调节器:
// drivers/cpufreq/rockchip-cpufreq.c/* RK 平台频率设置入口 */introckchip_cpufreq_opp_set_rate(structdevice *dev,unsignedlongtarget_freq){structcluster_info*cluster;structdev_pm_opp*opp;structrockchip_opp_info*opp_info;structdev_pm_opp_supplysupplies[2] = {0};unsignedlongfreq;intret =0;cluster =rockchip_cluster_info_lookup(dev->id);if(!cluster)return-EINVAL;opp_info = &cluster->opp_info;/* 获取 DVFS 锁,防止并发修改 */rockchip_opp_dvfs_lock(opp_info);/* 调用 OPP 子系统设置频率 */ret =dev_pm_opp_set_rate(dev, target_freq);if(!ret) {cluster->rate = freq = target_freq;/* 查找当前 OPP,获取电压信息 */opp =dev_pm_opp_find_freq_ceil(dev, &freq);if(!IS_ERR(opp)) {dev_pm_opp_get_supplies(opp, supplies);cluster->volt = supplies[0].u_volt;if(opp_info->regulator_count >1)cluster->mem_volt = supplies[1].u_volt;dev_pm_opp_put(opp);}}rockchip_opp_dvfs_unlock(opp_info);returnret;}EXPORT_SYMBOL_GPL(rockchip_cpufreq_opp_set_rate);/* RK3576 Read Margin 设置 */staticintrk3576_cpu_set_read_margin(structdevice *dev,structrockchip_opp_info *opp_info,u32 rm){if(!opp_info->volt_rm_tbl)return0;if(rm == opp_info->current_rm || rm == UINT_MAX)return0;dev_dbg(dev,"set rm to %dn", rm);/* 通过 GRF 配置 Read Margin */if(opp_info->grf) {/* CPU0-3 核心 */regmap_write(opp_info->grf,0x3c,0x001c0000| (rm << 2));regmap_write(opp_info->grf,0x44,0x001c0000| (rm << 2));/* 触发更新 */regmap_write(opp_info->grf,0x38,0x00020002);udelay(1);regmap_write(opp_info->grf,0x38,0x00020000);}/* CCI 接口 */if(opp_info->cci_grf)regmap_write(opp_info->cci_grf,0x54,0x001c0000| (rm << 2));opp_info->current_rm = rm;return0;}/* 多路 regulator 配置 */staticintcpu_opp_config_regulators(structdevice *dev,structdev_pm_opp *old_opp,structdev_pm_opp *new_opp,structregulator **regulators,unsignedintcount){structcluster_info*cluster;cluster =rockchip_cluster_info_lookup(dev->id);if(!cluster)return-EINVAL;returnrockchip_opp_config_regulators(dev, old_opp, new_opp, regulators,count, &cluster->opp_info);}
流程图:RK3576频率调整完整流程
st=>start: 调度器触发负载更新
op1=>operation: interactive调速器(eval_target_freq)
op2=>operation: 计算CPU负载,选择目标频率
op3=>operation: 核心层(cpufreq_freq_transition_begin)
op4=>operation: 发送PRECHANGE通知
op5=>operation: 驱动层(set_target)
op6=>operation: RK平台(rockchip_cpufreq_opp_set_rate)
op7=>operation: OPP子系统(dev_pm_opp_set_rate)
op8=>operation: 升频:先升压后升频;降频:先降频后降压
op9=>operation: 核心层(cpufreq_freq_transition_end)
op10=>operation: 发送POSTCHANGE通知,更新统计
e=>end: 频率切换完成
st->op1->op2->op3->op4->op5->op6->op7->op8->op9->op10->e
五、OPP子系统:频率-电压表的奥秘
OPP(Operating Performance Point)是连接软件和硬件的关键,定义了“频率-电压”的映射关系,是DVFS的基础。
5.1 OPP数据结构
// include/linux/pm_opp.hstructdev_pm_opp{structlist_headnode;unsignedlongrate; // 频率 (Hz)unsignedlongu_volt; // 电压 (uV)unsignedlongu_volt_min; // 最小电压unsignedlongu_volt_max; // 最大电压structdevice_opp*dev_opp; // 所属设备/* 支持的条件 */unsignedlongsupported_hw; // 硬件版本掩码/* 供电信息 */structopp_supply*supplies;unsignedintsupply_count;/* 自定义数据 */void*priv;};structdevice_opp{structlist_headnode;structdevice*dev;structsrcu_notifier_headsrcu_head;structlist_headopp_list;structclk*clk;structregulator**regulators;unsignedintregulator_count;structopp_table*opp_table;};
5.2 Device Tree OPP定义
RK3576的OPP表在DTS中定义,不同芯片版本支持不同频率:
// arch/arm64/boot/dts/rockchip/rk3576.dtsi

/* OPP table of the first CPU cluster: frequency/voltage pairs. */
cpu0_opp_table: opp-table-0 {
	compatible = "operating-points-v2";
	opp-shared;

	/* 408 MHz */
	opp-408000000 {
		opp-hz = /bits/ 64 <408000000>;
		opp-microvolt = <800000>;
		clock-latency-ns = <40000>;
	};

	/* 600 MHz */
	opp-600000000 {
		opp-hz = /bits/ 64 <600000000>;
		opp-microvolt = <825000>;
	};

	/* 816 MHz */
	opp-816000000 {
		opp-hz = /bits/ 64 <816000000>;
		opp-microvolt = <850000>;
	};

	/* 1.2 GHz */
	opp-1200000000 {
		opp-hz = /bits/ 64 <1200000000>;
		opp-microvolt = <925000>;
	};

	/* 1.608 GHz - only on specific chip versions */
	opp-1608000000 {
		opp-hz = /bits/ 64 <1608000000>;
		opp-microvolt = <1100000>;
		/* Two cells: bin mask, then volt_sel mask. */
		opp-supported-hw = <0x1 0x1>;	// bin=0, volt_sel=0
	};

	/* 1.8 GHz - higher-spec chips */
	opp-1800000000 {
		opp-hz = /bits/ 64 <1800000000>;
		opp-microvolt = <1175000>;
		/*
		 * NOTE(review): the article annotated this as volt_sel=0,1,2,
		 * but mask 0x3 only has bits 0 and 1 set. Confirm the value.
		 */
		opp-supported-hw = <0x1 0x3>;	// bin=0, volt_sel mask 0x3
	};
};
5.3 OPP查找与设置流程
OPP子系统的核心函数,负责频率/电压的实际设置:
// drivers/opp/core.c/*** dev_pm_opp_set_rate() - 设置设备到指定频率* @dev: 设备* @target_freq: 目标频率 (Hz)** 1. 查找匹配的 OPP* 2. 设置 regulator 电压* 3. 设置时钟频率*/intdev_pm_opp_set_rate(structdevice *dev,unsignedlongtarget_freq){structdevice_opp*dev_opp;structdev_pm_opp*opp;structclk*clk;unsignedlongold_freq, new_freq;intret;dev_opp = _find_device_opp(dev);if(IS_ERR(dev_opp))returnPTR_ERR(dev_opp);clk = dev_opp->clk;old_freq =clk_get_rate(clk);/* 查找目标频率对应的 OPP */opp = _find_freq_ceil(dev_opp, &target_freq);if(IS_ERR(opp)) {ret =PTR_ERR(opp);gotoput_opp;}new_freq = opp->rate;/* 如果频率相同,只更新电压 */if(new_freq == old_freq) {ret = _set_opp_voltage(dev, dev_opp, opp);gotoput_opp;}/* 升频:先升压,后升频 */if(new_freq > old_freq) {ret = _set_opp_voltage(dev, dev_opp, opp);if(ret)gotoput_opp;ret =clk_set_rate(clk, new_freq);if(ret) {/* 回滚电压 */_set_opp_voltage(dev, dev_opp, _find_freq_floor(dev_opp, &old_freq));}}else{/* 降频:先降频,后降压 */ret =clk_set_rate(clk, new_freq);if(ret)gotoput_opp;_set_opp_voltage(dev, dev_opp, opp);}put_opp:dev_pm_opp_put(opp);returnret;}
流程图:OPP频率-电压设置流程
st=>start: 调用dev_pm_opp_set_rate
op1=>operation: 查找设备对应的device_opp
op2=>operation: 获取当前时钟频率(old_freq)
op3=>operation: 查找目标频率的OPP(ceil)
cond1=>condition: new_freq == old_freq?
op5=>operation: 仅更新电压(_set_opp_voltage)
cond2=>condition: new_freq > old_freq?
op7=>operation: 先升压,后升频
cond3=>condition: 升频成功?
op8=>operation: 升频失败,回滚电压
op9=>operation: 先降频,后降压
e=>end: 返回设置结果
st->op1->op2->op3->cond1
cond1(yes)->op5->e
cond1(no)->cond2
cond2(yes)->op7->cond3
cond3(yes)->e
cond3(no)->op8->e
cond2(no)->op9->e
六、通知链机制:频率变更的广播系统
cpufreq的通知链是“事件广播”机制,允许其他子系统(如温控、功耗管理)监听频率变更事件,是内核模块化设计的典型体现。
6.1两种通知链
// drivers/cpufreq/cpufreq.c/* 策略通知链:策略创建/销毁时调用 */staticBLOCKING_NOTIFIER_HEAD(cpufreq_policy_notifier_list);/* 转换通知链:频率变更前后调用 */SRCU_NOTIFIER_HEAD_STATIC(cpufreq_transition_notifier_list);
6.2 RK平台的通知器注册
RK3576通过通知链实现“频率-空闲状态”联动、总线QoS约束:
// drivers/cpufreq/rockchip-cpufreq.c/* 策略通知器:处理监控注册和总线 QoS */staticintrockchip_cpufreq_notifier(structnotifier_block *nb,unsignedlongevent,void*data){structcpufreq_policy *policy = data;structcluster_info *cluster;cluster = rockchip_cluster_info_lookup(policy->cpu);if(!cluster)returnNOTIFY_BAD;switch(event) {caseCPUFREQ_CREATE_POLICY:/* 注册系统监控 */if(rockchip_cpufreq_add_monitor(cluster, policy))returnNOTIFY_BAD;/* 添加总线频率 QoS 约束 */if(rockchip_cpufreq_add_bus_qos_req(cluster, policy))returnNOTIFY_BAD;break;caseCPUFREQ_REMOVE_POLICY:rockchip_cpufreq_remove_monitor(cluster);rockchip_cpufreq_remove_bus_qos(cluster);break;}returnNOTIFY_OK;}/* 转换通知器:处理空闲状态 */staticintrockchip_cpufreq_transition_notifier(structnotifier_block *nb,unsignedlongevent,void*data){structcpufreq_freqs *freqs = data;structcpufreq_policy *policy = freqs->policy;structcluster_info *cluster;cluster = rockchip_cluster_info_lookup(policy->cpu);if(!cluster)returnNOTIFY_BAD;switch(event) {caseCPUFREQ_PRECHANGE:/* 高频时禁用深层空闲状态 */if(cluster->idle_threshold_freq &&freqs->new>= cluster->idle_threshold_freq &&!cluster->is_idle_disabled) {rockchip_cpufreq_idle_state_disable(policy->cpus,1,true);cluster->is_idle_disabled =true;}break;caseCPUFREQ_POSTCHANGE:/* 低频时重新启用空闲状态 */if(cluster->idle_threshold_freq &&freqs->new< cluster->idle_threshold_freq &&cluster->is_idle_disabled) {rockchip_cpufreq_idle_state_disable(policy->cpus,1,false);cluster->is_idle_disabled =false;}/* 更新总线频率请求 */rockchip_cpufreq_update_bus_req(cluster, freqs->new);break;}returnNOTIFY_OK;}
6.3使用场景示例
温控驱动通过通知链监听频率变更,更新热模型:
/* 温度管理驱动注册通知器 */staticintthermal_cpufreq_notifier(structnotifier_block *nb,unsignedlongevent,void*data){structcpufreq_freqs *freqs = data;if(event== CPUFREQ_POSTCHANGE) {/* 频率变更后更新热模型 */update_thermal_model(freqs->new);}returnNOTIFY_OK;}staticstructnotifier_block thermal_nb = {.notifier_call = thermal_cpufreq_notifier,};/* 注册 */cpufreq_register_notifier(&thermal_nb, CPUFREQ_TRANSITION_NOTIFIER);
七、统计与调试:洞察系统行为
掌握调试技巧,才能快速定位频率调节的问题,cpufreq提供了丰富的统计和调试接口。
7.1统计模块实现
// drivers/cpufreq/cpufreq_stats.cstructcpufreq_stats{unsignedinttotal_trans; // 总切换次数unsignedlonglonglast_time; // 上次更新时间unsignedintmax_state; // 最大状态数unsignedintstate_num; // 实际状态数unsignedintlast_index; // 当前状态索引u64 *time_in_state; // 各频率驻留时间unsignedint*freq_table; // 频率表unsignedint*trans_table; // 切换矩阵};/* 记录一次频率切换 */voidcpufreq_stats_record_transition(structcpufreq_policy *policy,unsignedintnew_freq){structcpufreq_stats*stats = policy->stats;intold_index, new_index;if(!stats)return;old_index = stats->last_index;new_index =freq_table_get_index(stats, new_freq);if(old_index ==-1|| new_index ==-1|| old_index == new_index)return;/* 更新驻留时间 */cpufreq_stats_update(stats, stats->last_time);stats->last_index = new_index;stats->trans_table[old_index * stats->max_state + new_index]++;stats->total_trans++;}
7.2调试技巧
# 1. Show the current frequency and policy settings
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_min_freq
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq

# 2. Show per-frequency residency statistics
cat /sys/devices/system/cpu/cpu0/cpufreq/stats/time_in_state
# Output format: frequency (kHz)  time (units of 10 ms)
# 408000 1234567
# 600000 2345678
# ...

# 3. Show the frequency transition matrix
cat /sys/devices/system/cpu/cpu0/cpufreq/stats/trans_table
# Number of transitions from each frequency to every other frequency

# 4. Show available frequencies and governors
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors

# 5. Switch the governor at run time and set a fixed frequency (kHz)
echo userspace > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
echo 1200000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed

# 6. Kernel log (-E is needed for the "|" alternation)
dmesg | grep -i cpufreq
dmesg | grep -iE "rk3576|rockchip"

# 7. Tracepoint: the frequency-change event is power:cpu_frequency
cd /sys/kernel/debug/tracing
echo 1 > events/power/cpu_frequency/enable
cat trace
八、实战:自定义调速器开发
掌握自定义调速器的开发,才能适配特定场景的性能/功耗需求。
8.1最小调速器框架
staticintmygov_target(structcpufreq_policy *policy,unsignedinttarget_freq,unsignedintrelation){/* 简单的频率设置 */return__cpufreq_driver_target(policy, target_freq, relation);}staticintmygov_init(structcpufreq_policy *policy){/* 初始化 */return0;}staticvoidmygov_exit(structcpufreq_policy *policy){/* 清理 */}staticstructcpufreq_governormy_governor = {.name ="mygov",.target = mygov_target,.init = mygov_init,.exit = mygov_exit,.owner = THIS_MODULE,};staticint__initmygov_init_module(void){returncpufreq_register_governor(&my_governor);}staticvoid__exitmygov_exit_module(void){cpufreq_unregister_governor(&my_governor);}module_init(mygov_init_module);module_exit(mygov_exit_module);MODULE_LICENSE("GPL");
8.2基于负载的调速器
staticvoidmygov_update(structcpufreq_policy *policy){unsignedintload =calculate_cpu_load(policy);unsignedintnew_freq;if(load >80) {/* 高负载:升频到最大 */new_freq = policy->max;}elseif(load < 20) {/* 低负载:降频到最小 */new_freq = policy->min;}else{/* 中等负载:线性插值 */new_freq = policy->min + load * (policy->max - policy->min) /100;}__cpufreq_driver_target(policy, new_freq, CPUFREQ_RELATION_H);}
脑图:自定义调速器开发
九、性能优化实战
针对实际开发中的痛点,优化频率切换延迟、负载计算精度。
9.1减少频率切换延迟
/* 1. 使用 fast frequency switching */staticstructcpufreq_drivermy_driver={.flags=CPUFREQ_FAST_SWITCHING,.fast_switch=my_fast_switch, // 原子上下文切换};/* 2. 减少 transition latency */// 在 OPP 表中设置较小的 clock-latency-nsopp-1200000000{opp-hz=/bits/64<1200000000>;clock-latency-ns=<10000>; // 10us,而不是 40us};
9.2优化负载计算
/* 使用 PELT 信号代替 idle time */staticunsignedintget_pelt_load(structcpufreq_policy *policy){unsignedintcpu = policy->cpu;structrq*rq =cpu_rq(cpu);/* PELT (Per-Entity Load Tracking) 是调度器内部的负载跟踪机制 */returnrq->cfs.avg.util_avg; // 0 ~ 1024}
脑图:性能优化实战
十、总结与延伸
10.1核心要点回顾
1.分层架构:governor → core → driver → OPP → clk/regulator
2.线程安全:transition_lock、DVFS lock、notifier chain
3.硬件抽象:OPP表统一频率-电压关系
4.平台特性:RK3576的Read Margin、多路regulator
10.2延伸阅读
•Documentation/cpu-freq/-内核文档
•drivers/opp/- OPP子系统实现
•drivers/cpufreq/cpufreq_schedutil.c-最新schedutil调速器
•include/linux/cpufreq.h-完整API定义
10.3调试checklist
□dmesg | grep cpufreq查看初始化日志
□cat scaling_available_frequencies确认频率表
□cat time_in_state确认频率切换正常
□ trace-cmd record -e power:cpu_frequency 抓取切换事件
□检查regulator是否支持动态电压调节
写在最后
cpufreq是Linux内核中最接近硬件的子系统之一,理解它不仅能帮你优化系统性能,更能深入理解内核的设备模型、电源管理和并发控制机制。







