RK3576 CPUFreq驱动深度剖析:从原理到实战

域垚达人 保险理财 2026-04-10 2352 0

嵌入式Linux性能调优核心战场,从代码层面彻底吃透DVFS机制

作为嵌入式Linux开发者,你是否只停留在echo performance > scaling_governor的表层操作?当遇到频率切换延迟高、电压调节失败、多核异构频率协同等问题时,是否束手无策?

今天这篇文章,将逐行剖析RK3576平台的cpufreq实现,从源码层面拆解Linux内核DVFS(动态电压频率调节)的每一个细节,带你从“会用”到“精通”,解决实际开发中的核心痛点。

一、为什么要深入cpufreq源码?

嵌入式Linux开发中,cpufreq是性能与功耗平衡的核心,但这些问题往往让开发者卡壳:

频率切换延迟过高,如何优化?

电压调节失败,如何定位?

多核异构场景下,频率如何协同?

自定义调速器,如何接入框架?

只有深入源码,才能真正掌握DVFS的底层逻辑,而非停留在调参数的表面操作。

二、整体架构:代码视角的分层设计

2.1文件组织与模块依赖

cpufreq驱动的代码分布清晰,核心文件各司其职:

drivers/cpufreq/
├── cpufreq.c              # 核心框架:策略管理、通知链、sysfs
├── cpufreq-dt.c           # DT 通用驱动:OPP 解析、时钟操作
├── cpufreq-dt.h           # DT 驱动接口定义
├── rockchip-cpufreq.c     # RK 平台驱动:SoC 特性、DVFS 锁、监控
├── rockchip-cpufreq.h     # RK 驱动导出接口
├── cpufreq_governor.c     # 调速器公共框架
├── cpufreq_governor.h     # 调速器数据结构
├── cpufreq_interactive.c  # Interactive 调速器实现
├── cpufreq_ondemand.c     # Ondemand 调速器实现
├── freq_table.c           # 频率表辅助函数
└── cpufreq_stats.c        # 统计信息模块

2.2核心结构体关系(代码定义)

cpufreq的核心逻辑围绕两个结构体展开,是理解整个框架的关键:

// include/linux/cpufreq.hstructcpufreq_policy{ cpumask_var_t cpus; // 本策略管理的 CPU 掩码 cpumask_var_t related_cpus; // 硬件相关的 CPU(共享时钟) unsignedint min; // 用户设置的最小频率 unsignedint max; // 用户设置的最大频率 unsignedint cur; // 当前频率(kHz) structcpufreq_governor*governor; // 当前调速器 void *governor_data;// 调速器私有数据 structcpufreq_frequency_table*freq_table;// 频率表 structcpufreq_stats *stats; // 统计信息 structkobject kobj; // sysfs 对象 void *driver_data; // 驱动私有数据 // ... 更多字段};structcpufreq_driver{ char name[CPUFREQ_NAME_LEN]; unsignedint flags;
 /* 必须实现的回调 */ int(*target_index)(structcpufreq_policy *policy,unsignedintindex); unsignedint(*get)(unsignedintcpu);
 /* 可选回调 */ int(*init)(structcpufreq_policy *policy); int(*exit)(structcpufreq_policy *policy); int(*online)(structcpufreq_policy *policy); int(*offline)(structcpufreq_policy *policy); // ...};

脑图:RK3576 CPUFreq核心架构

wKgZPGnRtT2AGcPQAAGnAiZaL38464.png

三、初始化流程:从模块加载到频率就绪

RK3576的cpufreq初始化遵循“先集群初始化,再注册驱动”的逻辑,核心分为三步:

3.1驱动入口:rockchip_cpufreq_driver_init

这是整个RK平台cpufreq的入口函数,核心逻辑是遍历CPU集群、初始化集群信息、注册通知链和平台设备:

// drivers/cpufreq/rockchip-cpufreq.cstaticint__initrockchip_cpufreq_driver_init(void){ structcluster_info *cluster, *pos; structcpufreq_dt_platform_data pdata = {0}; intcpu, ret; boolis_opp_shared_cpu_bus =false; /* 遍历所有可能的 CPU,为每个 cluster 初始化 */ for_each_possible_cpu(cpu) { cluster = rockchip_cluster_info_lookup(cpu); if(cluster) continue; // 已初始化过 cluster = kzalloc(sizeof(*cluster), GFP_KERNEL); if(!cluster) { ret = -ENOMEM; gotorelease_cluster_info; } /* 核心初始化:解析 OPP、获取 regulator、设置 SoC 信息 */ ret = rockchip_cpufreq_cluster_init(cpu, cluster); if(ret) { pr_err("Failed to initialize dvfs info cpu%dn", cpu); gotorelease_cluster_info; }
 list_add(&cluster->list_head, &cluster_info_list); if(cluster->is_opp_shared_cpu_bus) is_opp_shared_cpu_bus =true; } /* 设置平台数据 */ pdata.have_governor_per_policy =true; pdata.suspend = rockchip_cpufreq_suspend; /* 注册 cpufreq 通知链 */ ret = cpufreq_register_notifier(&rockchip_cpufreq_notifier_block, CPUFREQ_POLICY_NOTIFIER); if(ret) { pr_err("failed to register cpufreq notifiern"); gotorelease_cluster_info; } /* 如果存在 OPP 共享总线,注册 transition 通知器 */ if(is_opp_shared_cpu_bus) { ret = cpufreq_register_notifier(&rockchip_cpufreq_transition_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); // ... } /* 注册 panic 通知器,用于调试 */ ret = atomic_notifier_chain_register(&panic_notifier_list, &rockchip_cpufreq_panic_notifier_block); /* 注册 platform_device,触发 cpufreq-dt 驱动 probe */ returnPTR_ERR_OR_ZERO(platform_device_register_data(NULL,"cpufreq-dt", -1, (void*)&pdata,sizeof(structcpufreq_dt_platform_data)));}module_init(rockchip_cpufreq_driver_init);// drivers/cpufreq/rockchip-cpufreq.cstaticint__initrockchip_cpufreq_driver_init(void){ structcluster_info *cluster, *pos; structcpufreq_dt_platform_data pdata = {0}; intcpu, ret; boolis_opp_shared_cpu_bus =false; /* 遍历所有可能的 CPU,为每个 cluster 初始化 */ for_each_possible_cpu(cpu) { cluster = rockchip_cluster_info_lookup(cpu); if(cluster) continue; // 已初始化过 cluster = kzalloc(sizeof(*cluster), GFP_KERNEL); if(!cluster) { ret = -ENOMEM; gotorelease_cluster_info; } /* 核心初始化:解析 OPP、获取 regulator、设置 SoC 信息 */ ret = rockchip_cpufreq_cluster_init(cpu, cluster); if(ret) { pr_err("Failed to initialize dvfs info cpu%dn", cpu); gotorelease_cluster_info; }
 list_add(&cluster->list_head, &cluster_info_list); if(cluster->is_opp_shared_cpu_bus) is_opp_shared_cpu_bus =true; } /* 设置平台数据 */ pdata.have_governor_per_policy =true; pdata.suspend = rockchip_cpufreq_suspend; /* 注册 cpufreq 通知链 */ ret = cpufreq_register_notifier(&rockchip_cpufreq_notifier_block, CPUFREQ_POLICY_NOTIFIER); if(ret) { pr_err("failed to register cpufreq notifiern"); gotorelease_cluster_info; } /* 如果存在 OPP 共享总线,注册 transition 通知器 */ if(is_opp_shared_cpu_bus) { ret = cpufreq_register_notifier(&rockchip_cpufreq_transition_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); // ... } /* 注册 panic 通知器,用于调试 */ ret = atomic_notifier_chain_register(&panic_notifier_list, &rockchip_cpufreq_panic_notifier_block); /* 注册 platform_device,触发 cpufreq-dt 驱动 probe */ returnPTR_ERR_OR_ZERO(platform_device_register_data(NULL,"cpufreq-dt", -1, (void*)&pdata,sizeof(structcpufreq_dt_platform_data)));}module_init(rockchip_cpufreq_driver_init);

3.2 Cluster初始化:rockchip_cpufreq_cluster_init

每个CPU集群(大核/小核)的初始化核心是解析OPP表、获取电压调节器、读取SoC特有信息:

Cstaticintrockchip_cpufreq_cluster_init(intcpu,structcluster_info *cluster){ structrockchip_opp_info*opp_info = &cluster->opp_info; structdevice_node*np; structdevice*dev; char*reg_name; intret =0; u32 freq =0; dev =get_cpu_device(cpu); if(!dev) return-ENODEV; /* 从 CPU 节点获取 operating-points-v2 phandle */ np =of_parse_phandle(dev->of_node,"operating-points-v2",0); if(!np) { dev_warn(dev,"OPP-v2 not supportedn"); return-ENOENT; } /* 获取共享 OPP 的 CPU 掩码(大核/小核 cluster) */ ret =dev_pm_opp_of_get_sharing_cpus(dev, &cluster->cpus); if(ret) { dev_err(dev,"Failed to get sharing cpusn"); of_node_put(np); returnret; } /* 检查是否共享 DSU/CCI 总线 OPP */ if(of_property_read_bool(np,"rockchip,opp-shared-dsu") || of_property_read_bool(np,"rockchip,opp-shared-cci")) cluster->is_opp_shared_cpu_bus =true; /* 读取 CPU 到总线频率的百分比 */ of_property_read_u32(np,"rockchip,cpu-freq-percent", &cluster->cpu_freq_percent); /* 读取空闲阈值频率 */ if(!of_property_read_u32(np,"rockchip,idle-threshold-freq", &freq)) cluster->idle_threshold_freq = freq; of_node_put(np); /* 确定 regulator 名称 */ if(of_find_property(dev->of_node,"cpu-supply",NULL)) reg_name ="cpu"; elseif(of_find_property(dev->of_node,"cpu0-supply",NULL)) reg_name ="cpu0"; else return-ENOENT; /* 获取 SoC 特定的 OPP 数据处理函数 */ rockchip_get_opp_data(rockchip_cpufreq_of_match, opp_info); /* 初始化 OPP 信息:读取 nvmem、设置 supported_hw */ ret =rockchip_init_opp_info(dev, opp_info,NULL, reg_name); if(ret) dev_err(dev,"failed to init opp infon"); returnret;}

3.3 RK3576特有的SoC信息读取

RK3576不同版本(M/J/S)的频率/电压支持不同,核心是从nvmem读取芯片规格:

staticintrk3576_cpu_get_soc_info(structdevice *dev,structdevice_node *np, int*bin,int*process){ intret =0; u8 spec =0, test_version =0; if(!bin) return0; /* 从 nvmem 读取芯片规格序列号 */ if(of_property_match_string(np,"nvmem-cell-names", "specification_serial_number") >=0) { ret = rockchip_nvmem_cell_read_u8(np, "specification_serial_number", &spec); if(ret) { dev_err(dev,"Failed to get specification_serial_numbern"); returnret; } } /* 读取测试版本 */ if(of_property_match_string(np,"nvmem-cell-names","test_version") >=0) { ret = rockchip_nvmem_cell_read_u8(np,"test_version", &test_version); if(ret) { dev_err(dev,"Failed to get test_versionn"); returnret; } } /* 根据 spec 值判断芯片型号 */ if(spec ==0xd) { *bin =1; /* RK3576M */ }elseif(spec ==0xa) { *bin =2; /* RK3576J */ }elseif(spec ==0x13) { if(test_version ==0) { *bin =3; /* RK3576S */ }else{ *bin =0; dev_info(dev,"bin=%d (3)n", *bin); return0; } } if(*bin < 0) *bin =0; dev_info(dev,"bin=%dn", *bin); returnret;}

流程图:RK3576 CPUFreq初始化流程

st=>start: 模块加载(rockchip_cpufreq_driver_init)
op1=>operation: 遍历所有CPU
op2=>operation: 查找/创建cluster_info
op3=>operation: rockchip_cpufreq_cluster_init
op4=>operation: 解析OPP-v2节点
op5=>operation: 获取共享CPU掩码/总线信息
op6=>operation: 读取SoC信息(nvmem)
op7=>operation: 注册策略/转换通知链
op8=>operation: 注册cpufreq-dt平台设备
e=>end: 触发cpufreq-dt probe,初始化完成

st->op1->op2->op3->op4->op5->op6->op7->op8->e

四、频率调整核心:从Governor到硬件

频率调整是cpufreq的核心流程,完整链路是:调速器决策 → 核心层处理 → 驱动层执行 → OPP子系统 → 硬件(时钟/电压)

4.1调速器决策:interactive为例

interactive是嵌入式场景最常用的调速器,核心是按需快速升频,缓慢降频,关键逻辑在负载计算和频率评估:

// drivers/cpufreq/cpufreq_interactive.c/* 核心数据结构 */structinteractive_cpu{ structupdate_util_dataupdate_util; // 注册到调度器的钩子 structinteractive_policy*ipolicy;
 structirq_workirq_work; // 中断上下文工作 u64 last_sample_time; boolwork_in_progress;
 /* 负载计算相关 */ spinlock_tload_lock; u64 time_in_idle; u64 time_in_idle_timestamp; u64 cputime_speedadj; // 加权 CPU 时间
 /* 频率控制 */ spinlock_ttarget_freq_lock; unsignedinttarget_freq; unsignedintfloor_freq; // 最低允许频率 u64 pol_floor_val_time; // 策略级 floor 时间 u64 loc_floor_val_time; // CPU 级 floor 时间 // ...};/* 调度器回调:每次 CPU 状态更新时调用 */staticvoiddbs_update_util_handler(structupdate_util_data *data, u64 time, unsignedintflags){ structinteractive_cpu*icpu =container_of(data,structinteractive_cpu, update_util); structinteractive_policy*ipolicy = icpu->ipolicy; structinteractive_tunables*tunables = ipolicy->tunables; u64 delta_ns, lst; /* 检查是否可以更新 */ if(!cpufreq_this_cpu_can_update(ipolicy->policy)) return; /* 避免重复工作 */ if(icpu->work_in_progress) return; /* 检查采样间隔 */ lst =READ_ONCE(icpu->last_sample_time); delta_ns = time - lst; if((s64)delta_ns < tunables->sampling_rate * NSEC_PER_USEC) return; /* 提交 irq_work,在中断上下文执行 */ icpu->last_sample_time = time; icpu->work_in_progress =true; irq_work_queue(&icpu->irq_work);}/* 实际频率评估函数 */staticvoideval_target_freq(structinteractive_cpu *icpu){ structinteractive_tunables*tunables = icpu->ipolicy->tunables; structcpufreq_policy*policy = icpu->ipolicy->policy; u64 cputime_speedadj, now, max_fvtime; unsignedintnew_freq, loadadjfreq, delta_time; unsignedlongflags; intcpu_load; /* 计算 CPU 负载 */ spin_lock_irqsave(&icpu->load_lock, flags); now =update_load(icpu,smp_processor_id()); delta_time = (unsignedint)(now - icpu->cputime_speedadj_timestamp); cputime_speedadj = icpu->cputime_speedadj; spin_unlock_irqrestore(&icpu->load_lock, flags); if(!delta_time) return; /* 计算负载百分比 */ cpu_load = (unsignedint)(100* cputime_speedadj / delta_time) / policy->cur; spin_lock_irqsave(&icpu->target_freq_lock, flags); /* 根据负载选择目标频率 */ loadadjfreq = cpu_load * policy->cur;
 if(cpu_load >= tunables->go_hispeed_load) { /* 高负载:进入 hispeed_freq */ if(policy->cur < tunables->hispeed_freq) { new_freq = tunables->hispeed_freq; }else{ new_freq =choose_freq(icpu, loadadjfreq);
 /* 检查 above_hispeed_delay */ if(now - max_fvtime < freq_to_above_hispeed_delay(tunables, new_freq)) new_freq =max(new_freq, tunables->hispeed_freq); } }else{ /* 低负载:按比例降频 */ new_freq =choose_freq(icpu, loadadjfreq); } /* 应用 floor 约束 */ if(new_freq < icpu->floor_freq) { if(now - icpu->pol_floor_val_time < tunables->min_sample_time) new_freq = icpu->floor_freq; } /* 限制在策略范围内 */ new_freq =max(new_freq, policy->min); new_freq =min(new_freq, policy->max); /* 提交频率变更 */ if(new_freq != policy->cur) { icpu->target_freq = new_freq; spin_lock(&speedchange_cpumask_lock); cpumask_set_cpu(smp_processor_id(), &speedchange_cpumask); spin_unlock(&speedchange_cpumask_lock); wake_up_process(speedchange_task); // 唤醒内核线程执行变更 } spin_unlock_irqrestore(&icpu->target_freq_lock, flags);}

4.2频率选择算法choose_freq

interactive调速器的核心算法,用二分查找思想找到满足负载的最低频率:

/** 选择满足目标负载的最低频率* 采用二分查找思想,在频率表中寻找最优解*/staticunsignedintchoose_freq(structinteractive_cpu *icpu, unsignedintloadadjfreq){ structcpufreq_policy*policy = icpu->ipolicy->policy; structcpufreq_frequency_table*freq_table = policy->freq_table; unsignedintprevfreq, freqmin =0, freqmax = UINT_MAX, tl; unsignedintfreq = policy->cur; intindex; do{ prevfreq = freq;
 /* 获取当前频率的目标负载 */ tl =freq_to_targetload(icpu->ipolicy->tunables, freq); /* * 查找满足 loadadjfreq / tl <= freq 的最低频率 * 即:freq >= loadadjfreq / tl */ index =cpufreq_frequency_table_target(policy, loadadjfreq / tl, CPUFREQ_RELATION_L); freq = freq_table[index].frequency; if(freq > prevfreq) { /* 频率上升:记录最小值 */ freqmin = prevfreq; if(freq >= freqmax) { /* 超过上限,回退 */ index =cpufreq_frequency_table_target(policy, freqmax -1, CPUFREQ_RELATION_H); freq = freq_table[index].frequency; if(freq == freqmin) break; } }elseif(freq < prevfreq) { /* 频率下降:记录最大值 */ freqmax = prevfreq; if(freq <= freqmin) { /* 低于下限,回退 */ index =cpufreq_frequency_table_target(policy, freqmin +1, CPUFREQ_RELATION_L); freq = freq_table[index].frequency; if(freq == freqmax) break; } } }while(freq != prevfreq); returnfreq;}

4.3核心层频率切换:cpufreq_core

核心层负责频率切换的同步、通知和状态管理,保证线程安全:

// drivers/cpufreq/cpufreq.c/* 开始频率转换 */voidcpufreq_freq_transition_begin(structcpufreq_policy *policy, structcpufreq_freqs *freqs){ /* 防止重复调用导致死锁 */ WARN_ON(!(cpufreq_driver->flags & CPUFREQ_ASYNC_NOTIFICATION) && current == policy->transition_task);wait: /* 等待前一次转换完成 */ wait_event(policy->transition_wait, !policy->transition_ongoing); spin_lock(&policy->transition_lock); if(unlikely(policy->transition_ongoing)) { spin_unlock(&policy->transition_lock); gotowait; } policy->transition_ongoing =true; policy->transition_task = current; spin_unlock(&policy->transition_lock); /* 发送 PRECHANGE 通知 */ cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE);}/* 频率转换通知 */staticvoidcpufreq_notify_transition(structcpufreq_policy *policy, structcpufreq_freqs *freqs, unsignedintstate){ intcpu; freqs->policy = policy; freqs->flags = cpufreq_driver->flags; switch(state) { caseCPUFREQ_PRECHANGE: /* 同步 old frequency */ if(policy->cur && policy->cur != freqs->old) { freqs->old = policy->cur; } /* 调用 transition notifier 链 */ srcu_notifier_call_chain(&cpufreq_transition_notifier_list, CPUFREQ_PRECHANGE, freqs); /* 调整 loops_per_jiffy */ adjust_jiffies(CPUFREQ_PRECHANGE, freqs); break; caseCPUFREQ_POSTCHANGE: adjust_jiffies(CPUFREQ_POSTCHANGE, freqs); /* 记录 tracepoint */ for_each_cpu(cpu, policy->cpus) trace_cpu_frequency(freqs->new, cpu); /* 调用 transition notifier 链 */ srcu_notifier_call_chain(&cpufreq_transition_notifier_list, CPUFREQ_POSTCHANGE, freqs); /* 更新统计 */ cpufreq_stats_record_transition(policy, freqs->new); policy->cur = freqs->new; }}/* 结束频率转换 */voidcpufreq_freq_transition_end(structcpufreq_policy *policy, structcpufreq_freqs *freqs, inttransition_failed){ cpufreq_notify_post_transition(policy, freqs, transition_failed); /* 更新频率缩放比例(用于调度器) */ arch_set_freq_scale(policy->related_cpus, policy->cur, policy->cpuinfo.max_freq); spin_lock(&policy->transition_lock); policy->transition_ongoing =false; policy->transition_task =NULL; spin_unlock(&policy->transition_lock); 
wake_up(&policy->transition_wait);}

4.4驱动层实现:cpufreq-dt

DT驱动是通用层,对接核心层和平台层,核心是频率/电压的实际设置:

// drivers/cpufreq/cpufreq-dt.c/* 设置目标频率 */staticintset_target(structcpufreq_policy *policy,unsignedintindex){ structprivate_data*priv = policy->driver_data; unsignedlongfreq = policy->freq_table[index].frequency;#ifdefCONFIG_ARCH_ROCKCHIP /* RK 平台使用特殊的 OPP 设置函数 */ returnrockchip_cpufreq_opp_set_rate(priv->cpu_dev, freq *1000);#else returndev_pm_opp_set_rate(priv->cpu_dev, freq *1000);#endif}/* 驱动初始化 */staticintcpufreq_init(structcpufreq_policy *policy){ structprivate_data*priv; structdevice*cpu_dev; structclk*cpu_clk; unsignedinttransition_latency; intret; priv =cpufreq_dt_find_data(policy->cpu); if(!priv) { pr_err("failed to find data for cpu%dn", policy->cpu); return-ENODEV; } cpu_dev = priv->cpu_dev; /* 获取 CPU 时钟 */ cpu_clk =clk_get(cpu_dev,NULL); if(IS_ERR(cpu_clk)) { ret =PTR_ERR(cpu_clk); dev_err(cpu_dev,"%s: failed to get clk: %dn", __func__, ret); returnret; } /* 获取最大转换延迟 */ transition_latency =dev_pm_opp_get_max_transition_latency(cpu_dev); if(!transition_latency) transition_latency = CPUFREQ_ETERNAL; /* 填充 policy */ cpumask_copy(policy->cpus, priv->cpus); policy->driver_data = priv; policy->clk = cpu_clk; policy->freq_table = priv->freq_table; policy->suspend_freq =dev_pm_opp_get_suspend_opp_freq(cpu_dev) /1000; policy->cpuinfo.transition_latency = transition_latency; policy->dvfs_possible_from_any_cpu =true; /* 支持 boost 模式 */ if(policy_has_boost_freq(policy)) { ret =cpufreq_enable_boost_support(); if(ret) gotoout_clk_put; cpufreq_dt_attr[1] = &cpufreq_freq_attr_scaling_boost_freqs; } return0;out_clk_put: clk_put(cpu_clk); returnret;}staticstructcpufreq_driverdt_cpufreq_driver = { .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_IS_COOLING_DEV, .verify = cpufreq_generic_frequency_table_verify, .target_index = set_target, .get = cpufreq_generic_get, .init = cpufreq_init, .exit = cpufreq_exit, .online = cpufreq_online, .offline = cpufreq_offline, .register_em = cpufreq_register_em_with_opp, .name ="cpufreq-dt", .attr = cpufreq_dt_attr, .suspend = 
cpufreq_generic_suspend,};

4.5 RK平台特殊处理:rockchip_cpufreq

RK3576的定制化逻辑,包括DVFS锁、Read Margin、多路电压调节器:

// drivers/cpufreq/rockchip-cpufreq.c/* RK 平台频率设置入口 */introckchip_cpufreq_opp_set_rate(structdevice *dev,unsignedlongtarget_freq){ structcluster_info*cluster; structdev_pm_opp*opp; structrockchip_opp_info*opp_info; structdev_pm_opp_supplysupplies[2] = {0}; unsignedlongfreq; intret =0; cluster =rockchip_cluster_info_lookup(dev->id); if(!cluster) return-EINVAL; opp_info = &cluster->opp_info; /* 获取 DVFS 锁,防止并发修改 */ rockchip_opp_dvfs_lock(opp_info);
 /* 调用 OPP 子系统设置频率 */ ret =dev_pm_opp_set_rate(dev, target_freq);
 if(!ret) { cluster->rate = freq = target_freq;
 /* 查找当前 OPP,获取电压信息 */ opp =dev_pm_opp_find_freq_ceil(dev, &freq); if(!IS_ERR(opp)) { dev_pm_opp_get_supplies(opp, supplies); cluster->volt = supplies[0].u_volt; if(opp_info->regulator_count >1) cluster->mem_volt = supplies[1].u_volt; dev_pm_opp_put(opp); } }
 rockchip_opp_dvfs_unlock(opp_info); returnret;}EXPORT_SYMBOL_GPL(rockchip_cpufreq_opp_set_rate);/* RK3576 Read Margin 设置 */staticintrk3576_cpu_set_read_margin(structdevice *dev, structrockchip_opp_info *opp_info, u32 rm){ if(!opp_info->volt_rm_tbl) return0; if(rm == opp_info->current_rm || rm == UINT_MAX) return0; dev_dbg(dev,"set rm to %dn", rm);
 /* 通过 GRF 配置 Read Margin */ if(opp_info->grf) { /* CPU0-3 核心 */ regmap_write(opp_info->grf,0x3c,0x001c0000| (rm << 2)); regmap_write(opp_info->grf,0x44,0x001c0000| (rm << 2));
 /* 触发更新 */ regmap_write(opp_info->grf,0x38,0x00020002); udelay(1); regmap_write(opp_info->grf,0x38,0x00020000); }
 /* CCI 接口 */ if(opp_info->cci_grf) regmap_write(opp_info->cci_grf,0x54,0x001c0000| (rm << 2)); opp_info->current_rm = rm; return0;}/* 多路 regulator 配置 */staticintcpu_opp_config_regulators(structdevice *dev, structdev_pm_opp *old_opp, structdev_pm_opp *new_opp, structregulator **regulators, unsignedintcount){ structcluster_info*cluster; cluster =rockchip_cluster_info_lookup(dev->id); if(!cluster) return-EINVAL; returnrockchip_opp_config_regulators(dev, old_opp, new_opp, regulators, count, &cluster->opp_info);}

流程图:RK3576频率调整完整流程

st=>start: 调度器触发负载更新
op1=>operation: interactive调速器(eval_target_freq)
op2=>operation: 计算CPU负载,选择目标频率
op3=>operation: 核心层(cpufreq_freq_transition_begin)
op4=>operation: 发送PRECHANGE通知
op5=>operation: 驱动层(set_target)
op6=>operation: RK平台(rockchip_cpufreq_opp_set_rate)
op7=>operation: OPP子系统(dev_pm_opp_set_rate)
op8=>operation: 升频:先升压后升频;降频:先降频后降压
op9=>operation: 核心层(cpufreq_freq_transition_end)
op10=>operation: 发送POSTCHANGE通知,更新统计
e=>end: 频率切换完成

st->op1->op2->op3->op4->op5->op6->op7->op8->op9->op10->e

五、OPP子系统:频率-电压表的奥秘

OPP(Operating Performance Point)是连接软件和硬件的关键,定义了频率-电压的映射关系,是DVFS的基础。

5.1 OPP数据结构

// include/linux/pm_opp.hstructdev_pm_opp{ structlist_headnode;
 unsignedlongrate; // 频率 (Hz) unsignedlongu_volt; // 电压 (uV) unsignedlongu_volt_min; // 最小电压 unsignedlongu_volt_max; // 最大电压
 structdevice_opp*dev_opp; // 所属设备
 /* 支持的条件 */ unsignedlongsupported_hw; // 硬件版本掩码
 /* 供电信息 */ structopp_supply*supplies; unsignedintsupply_count;
 /* 自定义数据 */ void*priv;};structdevice_opp{ structlist_headnode; structdevice*dev; structsrcu_notifier_headsrcu_head; structlist_headopp_list;
 structclk*clk; structregulator**regulators; unsignedintregulator_count;
 structopp_table*opp_table;};

5.2 Device Tree OPP定义

RK3576的OPP表在DTS中定义,不同芯片版本支持不同频率:

// arch/arm64/boot/dts/rockchip/rk3576.dtsi
cpu0_opp_table: opp-table-0 {
	compatible = "operating-points-v2";
	opp-shared;

	/* 408 MHz */
	opp-408000000 {
		opp-hz = /bits/ 64 <408000000>;
		opp-microvolt = <800000>;
		clock-latency-ns = <40000>;
	};

	/* 600 MHz */
	opp-600000000 {
		opp-hz = /bits/ 64 <600000000>;
		opp-microvolt = <825000>;
	};

	/* 816 MHz */
	opp-816000000 {
		opp-hz = /bits/ 64 <816000000>;
		opp-microvolt = <850000>;
	};

	/* 1.2 GHz */
	opp-1200000000 {
		opp-hz = /bits/ 64 <1200000000>;
		opp-microvolt = <925000>;
	};

	/* 1.608 GHz - only on specific chip versions */
	opp-1608000000 {
		opp-hz = /bits/ 64 <1608000000>;
		opp-microvolt = <1100000>;
		/* Two cells: bin mask, volt_sel mask. */
		opp-supported-hw = <0x1 0x1>;	/* bin=0, volt_sel=0 */
	};

	/* 1.8 GHz - higher-grade chips */
	opp-1800000000 {
		opp-hz = /bits/ 64 <1800000000>;
		opp-microvolt = <1175000>;
		/*
		 * NOTE(review): the original comment said volt_sel=0,1,2, but
		 * mask 0x3 only has bits 0 and 1 set; 0,1,2 would need 0x7.
		 */
		opp-supported-hw = <0x1 0x3>;	/* bin=0, volt_sel=0,1 */
	};
};

5.3 OPP查找与设置流程

OPP子系统的核心函数,负责频率/电压的实际设置:

// drivers/opp/core.c/*** dev_pm_opp_set_rate() - 设置设备到指定频率* @dev: 设备* @target_freq: 目标频率 (Hz)** 1. 查找匹配的 OPP* 2. 设置 regulator 电压* 3. 设置时钟频率*/intdev_pm_opp_set_rate(structdevice *dev,unsignedlongtarget_freq){ structdevice_opp*dev_opp; structdev_pm_opp*opp; structclk*clk; unsignedlongold_freq, new_freq; intret; dev_opp = _find_device_opp(dev); if(IS_ERR(dev_opp)) returnPTR_ERR(dev_opp); clk = dev_opp->clk; old_freq =clk_get_rate(clk); /* 查找目标频率对应的 OPP */ opp = _find_freq_ceil(dev_opp, &target_freq); if(IS_ERR(opp)) { ret =PTR_ERR(opp); gotoput_opp; } new_freq = opp->rate; /* 如果频率相同,只更新电压 */ if(new_freq == old_freq) { ret = _set_opp_voltage(dev, dev_opp, opp); gotoput_opp; } /* 升频:先升压,后升频 */ if(new_freq > old_freq) { ret = _set_opp_voltage(dev, dev_opp, opp); if(ret) gotoput_opp;
 ret =clk_set_rate(clk, new_freq); if(ret) { /* 回滚电压 */ _set_opp_voltage(dev, dev_opp, _find_freq_floor(dev_opp, &old_freq)); } }else{ /* 降频:先降频,后降压 */ ret =clk_set_rate(clk, new_freq); if(ret) gotoput_opp;
 _set_opp_voltage(dev, dev_opp, opp); }put_opp: dev_pm_opp_put(opp); returnret;}

流程图:OPP频率-电压设置流程

st=>start: 调用dev_pm_opp_set_rate
op1=>operation: 查找设备对应的device_opp
op2=>operation: 获取当前时钟频率(old_freq)
op3=>operation: 查找目标频率的OPP(ceil)
op4=>condition: new_freq == old_freq?
op5=>operation: 仅更新电压(_set_opp_voltage)
op6=>condition: new_freq > old_freq?
op7=>operation: 先升压,后升频
c7=>condition: 升频成功?
op8=>operation: 升频失败,回滚电压
op9=>operation: 先降频,后降压
e=>end: 返回设置结果

st->op1->op2->op3->op4
op4(yes)->op5->e
op4(no)->op6
op6(yes)->op7->c7
c7(yes)->e
c7(no)->op8->e
op6(no)->op9->e

六、通知链机制:频率变更的广播系统

cpufreq的通知链是事件广播机制,允许其他子系统(如温控、功耗管理)监听频率变更事件,是内核模块化设计的典型体现。

6.1两种通知链

// drivers/cpufreq/cpufreq.c

/* Policy notifier chain: called when a policy is created or removed. */
static BLOCKING_NOTIFIER_HEAD(cpufreq_policy_notifier_list);

/* Transition notifier chain: called before and after a frequency change. */
SRCU_NOTIFIER_HEAD_STATIC(cpufreq_transition_notifier_list);

6.2 RK平台的通知器注册

RK3576通过通知链实现频率-空闲状态联动、总线QoS约束:

// drivers/cpufreq/rockchip-cpufreq.c/* 策略通知器:处理监控注册和总线 QoS */staticintrockchip_cpufreq_notifier(structnotifier_block *nb, unsignedlongevent,void*data){ structcpufreq_policy *policy = data; structcluster_info *cluster; cluster = rockchip_cluster_info_lookup(policy->cpu); if(!cluster) returnNOTIFY_BAD; switch(event) { caseCPUFREQ_CREATE_POLICY: /* 注册系统监控 */ if(rockchip_cpufreq_add_monitor(cluster, policy)) returnNOTIFY_BAD;
 /* 添加总线频率 QoS 约束 */ if(rockchip_cpufreq_add_bus_qos_req(cluster, policy)) returnNOTIFY_BAD; break; caseCPUFREQ_REMOVE_POLICY: rockchip_cpufreq_remove_monitor(cluster); rockchip_cpufreq_remove_bus_qos(cluster); break; } returnNOTIFY_OK;}/* 转换通知器:处理空闲状态 */staticintrockchip_cpufreq_transition_notifier(structnotifier_block *nb, unsignedlongevent,void*data){ structcpufreq_freqs *freqs = data; structcpufreq_policy *policy = freqs->policy; structcluster_info *cluster; cluster = rockchip_cluster_info_lookup(policy->cpu); if(!cluster) returnNOTIFY_BAD; switch(event) { caseCPUFREQ_PRECHANGE: /* 高频时禁用深层空闲状态 */ if(cluster->idle_threshold_freq && freqs->new>= cluster->idle_threshold_freq && !cluster->is_idle_disabled) { rockchip_cpufreq_idle_state_disable(policy->cpus,1,true); cluster->is_idle_disabled =true; } break; caseCPUFREQ_POSTCHANGE: /* 低频时重新启用空闲状态 */ if(cluster->idle_threshold_freq && freqs->new< cluster->idle_threshold_freq && cluster->is_idle_disabled) { rockchip_cpufreq_idle_state_disable(policy->cpus,1,false); cluster->is_idle_disabled =false; }
 /* 更新总线频率请求 */ rockchip_cpufreq_update_bus_req(cluster, freqs->new); break; } returnNOTIFY_OK;}

6.3使用场景示例

温控驱动通过通知链监听频率变更,更新热模型:

/* 温度管理驱动注册通知器 */staticintthermal_cpufreq_notifier(structnotifier_block *nb, unsignedlongevent,void*data){ structcpufreq_freqs *freqs = data;
 if(event== CPUFREQ_POSTCHANGE) { /* 频率变更后更新热模型 */ update_thermal_model(freqs->new); }
 returnNOTIFY_OK;}staticstructnotifier_block thermal_nb = { .notifier_call = thermal_cpufreq_notifier,};/* 注册 */cpufreq_register_notifier(&thermal_nb, CPUFREQ_TRANSITION_NOTIFIER);

七、统计与调试:洞察系统行为

掌握调试技巧,才能快速定位频率调节的问题,cpufreq提供了丰富的统计和调试接口。

7.1统计模块实现

// drivers/cpufreq/cpufreq_stats.cstructcpufreq_stats{ unsignedinttotal_trans; // 总切换次数 unsignedlonglonglast_time; // 上次更新时间 unsignedintmax_state; // 最大状态数 unsignedintstate_num; // 实际状态数 unsignedintlast_index; // 当前状态索引 u64 *time_in_state; // 各频率驻留时间 unsignedint*freq_table; // 频率表 unsignedint*trans_table; // 切换矩阵};/* 记录一次频率切换 */voidcpufreq_stats_record_transition(structcpufreq_policy *policy, unsignedintnew_freq){ structcpufreq_stats*stats = policy->stats; intold_index, new_index; if(!stats) return; old_index = stats->last_index; new_index =freq_table_get_index(stats, new_freq); if(old_index ==-1|| new_index ==-1|| old_index == new_index) return; /* 更新驻留时间 */ cpufreq_stats_update(stats, stats->last_time); stats->last_index = new_index; stats->trans_table[old_index * stats->max_state + new_index]++; stats->total_trans++;}

7.2调试技巧

# 1. Show the current frequency and policy settings
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_min_freq
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq

# 2. Show the per-frequency residency statistics
cat /sys/devices/system/cpu/cpu0/cpufreq/stats/time_in_state
# Output format: <frequency in kHz> <time in units of 10 ms>
#   408000 1234567
#   600000 2345678
#   ...

# 3. Show the frequency transition matrix
cat /sys/devices/system/cpu/cpu0/cpufreq/stats/trans_table
# Number of switches from each frequency to every other frequency

# 4. Show the available frequencies and governors
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors

# 5. Switch governor at run time and set a fixed frequency (kHz)
echo userspace > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
echo 1200000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed

# 6. Kernel log debugging
dmesg | grep -i cpufreq
# -E is required: in a basic regex "|" is a literal character
dmesg | grep -iE "rk3576|rockchip"

# 7. Use the tracepoint (cpu_frequency belongs to the "power" trace system)
cd /sys/kernel/debug/tracing
echo 1 > events/power/cpu_frequency/enable
cat trace

八、实战:自定义调速器开发

掌握自定义调速器的开发,才能适配特定场景的性能/功耗需求。

8.1最小调速器框架

#include
      #include
      staticintmygov_target(structcpufreq_policy *policy,unsignedinttarget_freq, unsignedintrelation){ /* 简单的频率设置 */ return__cpufreq_driver_target(policy, target_freq, relation);}staticintmygov_init(structcpufreq_policy *policy){ /* 初始化 */ return0;}staticvoidmygov_exit(structcpufreq_policy *policy){ /* 清理 */}staticstructcpufreq_governormy_governor = { .name ="mygov", .target = mygov_target, .init = mygov_init, .exit = mygov_exit, .owner = THIS_MODULE,};staticint__initmygov_init_module(void){ returncpufreq_register_governor(&my_governor);}staticvoid__exitmygov_exit_module(void){ cpufreq_unregister_governor(&my_governor);}module_init(mygov_init_module);module_exit(mygov_exit_module);MODULE_LICENSE("GPL");

8.2基于负载的调速器

staticvoidmygov_update(structcpufreq_policy *policy){ unsignedintload =calculate_cpu_load(policy); unsignedintnew_freq; if(load >80) { /* 高负载:升频到最大 */ new_freq = policy->max; }elseif(load < 20) { /* 低负载:降频到最小 */ new_freq = policy->min; }else{ /* 中等负载:线性插值 */ new_freq = policy->min + load * (policy->max - policy->min) /100; } __cpufreq_driver_target(policy, new_freq, CPUFREQ_RELATION_H);}

脑图:自定义调速器开发

九、性能优化实战

针对实际开发中的痛点,优化频率切换延迟、负载计算精度。

9.1减少频率切换延迟

/* 1. 使用 fast frequency switching */staticstructcpufreq_drivermy_driver={ .flags=CPUFREQ_FAST_SWITCHING, .fast_switch=my_fast_switch, // 原子上下文切换};/* 2. 减少 transition latency */// 在 OPP 表中设置较小的 clock-latency-nsopp-1200000000{ opp-hz=/bits/64<1200000000>; clock-latency-ns=<10000>; // 10us,而不是 40us};

9.2优化负载计算

/* 使用 PELT 信号代替 idle time */#include
      staticunsignedintget_pelt_load(structcpufreq_policy *policy){ unsignedintcpu = policy->cpu; structrq*rq =cpu_rq(cpu);
 /* PELT (Per-Entity Load Tracking) 是调度器内部的负载跟踪机制 */ returnrq->cfs.avg.util_avg; // 0 ~ 1024}

脑图:性能优化实战

wKgZPGnRtT2ATjvXAAHk74WONG8190.png

十、总结与延伸

10.1核心要点回顾

1. 分层架构:governor → core → driver → OPP → clk/regulator

2. 线程安全:transition_lock、DVFS lock、notifier chain

3. 硬件抽象:OPP表统一频率-电压关系

4. 平台特性:RK3576的Read Margin、多路regulator

10.2延伸阅读

Documentation/cpu-freq/-内核文档

drivers/opp/- OPP子系统实现

drivers/cpufreq/cpufreq_schedutil.c-最新schedutil调速器

include/linux/cpufreq.h-完整API定义

10.3调试checklist

dmesg | grep cpufreq查看初始化日志

cat scaling_available_frequencies确认频率表

cat time_in_state确认频率切换正常

trace-cmd record -e cpufreq抓取切换事件

检查regulator是否支持动态电压调节

写在最后

cpufreq是Linux内核中最接近硬件的子系统之一,理解它不仅能帮你优化系统性能,更能深入理解内核的设备模型、电源管理和并发控制机制。