1. IO 性能优化架构
BayBridge 芯片 IO 性能优化的核心目标:
- 最大化吞吐量:充分利用 eMMC HS400 带宽(~300MB/s)
- 最小化延迟:减少 NVMe 命令到 eMMC 执行的路径延迟
- 高效并发:多任务队列并行调度
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
| ┌─────────────────────────────────────────────────────────────────┐ │ Host (NVMe Driver) │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │ │ SQ 0 │ │ SQ 1 │ │ SQ 2 │ │ SQ 3 │ ... │ │ │ (Admin) │ │ (IO) │ │ (IO) │ │ (IO) │ │ │ └────┬────┘ └────┬────┘ └────┬────┘ └────┬────┘ │ └───────│────────────│────────────│────────────│──────────────────┘ │ PCIe │ │ │ ┌───────▼────────────▼────────────▼────────────▼──────────────────┐ │ BayBridge Controller │ │ ┌──────────────────────────────────────────────────────────┐ │ │ │ Hardware Command Fetch Engine │ │ │ │ ┌─────────┐ ┌─────────┐ │ │ │ │ │CMD BUF 0│◄─── Ping ───►│CMD BUF 1│ │ │ │ │ └────┬────┘ Pong └────┬────┘ │ │ │ └─────────│───────────────────────│─────────────────────────┘ │ │ │ │ │ │ ┌─────────▼───────────────────────▼─────────────────────────┐ │ │ │ Firmware Task Queue (32 depth × 2) │ │ │ │ ┌─────────────────────┐ ┌─────────────────────┐ │ │ │ │ │ eMMC #0 Task Queue │ │ eMMC #1 Task Queue │ │ │ │ │ │ [PRE_CANDIDATE] │ │ [PRE_CANDIDATE] │ │ │ │ │ │ [CANDIDATE] │ │ [CANDIDATE] │ │ │ │ │ │ [READY] │ │ [READY] │ │ │ │ │ │ [EXECUTING] │ │ [EXECUTING] │ │ │ │ │ │ [COMPLETED] │ │ [COMPLETED] │ │ │ │ │ └─────────┬───────────┘ └───────────┬─────────┘ │ │ │ └────────────│──────────────────────────│───────────────────┘ │ │ │ │ │ │ ┌────────────▼──────────────────────────▼───────────────────┐ │ │ │ Data Transfer Controller (DMA + PRP) │ │ │ └────────────────────────────┬──────────────────────────────┘ │ └───────────────────────────────│──────────────────────────────────┘ │ eMMC Interface (HS400) ┌───────────────────────────────▼──────────────────────────────────┐ │ eMMC (Command Queuing) │ │ ┌──────────────────────┐ ┌──────────────────────┐ │ │ │ eMMC #0 │ │ eMMC #1 │ │ │ │ CMD44/45 → CQ Depth │ │ CMD44/45 → CQ Depth │ │ │ │ CMD46/47 → Execute │ │ CMD46/47 → Execute │ │ │ └──────────────────────┘ └──────────────────────┘ │ └──────────────────────────────────────────────────────────────────┘
|
2. 双 Buffer 命令获取机制
2.1 Ping-Pong Buffer 设计
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
|
/* Identifies which ping-pong command buffer (0 or 1) holds the most recently
 * hardware-fetched NVMe command; set from the fetch-complete ISR below. */
byte global_buffer_identifier = 0;
/* Copy one NVMe command from the selected hardware ping-pong buffer into
 * 'command' using 4-byte copies, then byte-swap both 64-bit PRP entries.
 * NOTE(review): the swap suggests the LEON2 core's endianness differs from
 * the NVMe command layout in the buffer — confirm against the CPU config. */
static void fetch_command(byte buffer_identify, command_p command) { if (buffer_identify == 0) { fn_memcpy_4bytes(command, (void *)BAYBRIDGE_CMD0_BASE, sizeof(command_t)); } else { fn_memcpy_4bytes(command, (void *)BAYBRIDGE_CMD1_BASE, sizeof(command_t)); }
command->prp.prp_entry1 = leon2_cpu_64bits_swap(command->prp.prp_entry1); command->prp.prp_entry2 = leon2_cpu_64bits_swap(command->prp.prp_entry2); }
|
2.2 中断驱动的命令处理
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
/* Fetch-complete ISR: snapshot the internal interrupt status, then service
 * each command buffer whose fetch finished. Writing the status bit back
 * appears to be write-1-to-clear — TODO confirm against the register spec.
 * Both buffers may be handled in a single pass (ping-pong overlap). */
| void interrupt_process(void) { u32 internal_interrupt_status = BAYBRIDGE_INTERNAL_INTERRUPT_STATUS;
/* Buffer 0 fetched: ack the bit, select buffer 0, count the task, parse. */
if (internal_interrupt_status & CMD_BUFFER0_FETCH_COMPLETE_INTERRUPT_STATUS_BIT) { BAYBRIDGE_INTERNAL_INTERRUPT_STATUS = CMD_BUFFER0_FETCH_COMPLETE_INTERRUPT_STATUS_BIT; global_buffer_identifier = 0; pm.update_task_counter(&pm_dscp, NVME_TASK, ADD); command_parse(); }
/* Buffer 1 fetched: same handling with buffer 1 selected. */
if (internal_interrupt_status & CMD_BUFFER1_FETCH_COMPLETE_INTERRUPT_STATUS_BIT) { BAYBRIDGE_INTERNAL_INTERRUPT_STATUS = CMD_BUFFER1_FETCH_COMPLETE_INTERRUPT_STATUS_BIT; global_buffer_identifier = 1; pm.update_task_counter(&pm_dscp, NVME_TASK, ADD); command_parse(); } }
|
优化效果:
- 当处理 Buffer 0 命令时,硬件可同时向 Buffer 1 填充下一条命令
- 减少命令获取等待时间,提高命令吞吐量
3. 多级任务队列设计
3.1 任务状态机
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
| ┌────────────────────────────────────────────────────────────────────────┐ │ Task State Machine │ │ │ │ ┌─────────┐ ┌────────────────┐ ┌────────────────┐ │ │ │ EMPTY │───►│ PRE_CANDIDATE │───►│ CANDIDATE │ │ │ │ (0x00) │ │ (0x01) │ │ (0x02) │ │ │ └────▲────┘ └────────────────┘ └───────┬────────┘ │ │ │ │ │ │ │ eMMC Reports ▼ │ │ │ Task Ready ┌────────────────┐ │ │ │ (QSR Query) │ READY │ │ │ │ │ (0x05) │ │ │ │ └───────┬────────┘ │ │ │ │ │ │ │ Execute CMD46/47 ▼ │ │ │ ┌────────────────┐ │ │ │ │ EXECUTING │ │ │ │ │ (0x03) │ │ │ │ └──┬─────────┬───┘ │ │ │ │ │ │ │ │ Error │ │ Success │ │ │ ▼ ▼ │ │ │ ┌────────────────┐ ┌─────────────────────┐ │ │ │ │ ERR_STATUS │ │ COMPLETED │ │ │ │ │ (0x7F) │ │ (0x04) │ │ │ │ └───────┬────────┘ └──────────┬──────────┘ │ │ │ │ │ │ │ │ │ Recovery │ Send Completion │ │ └──────────┴────────────────────────────┘ │ │ │ └─────────────────────────────────────────────────────────────────────────┘
|
3.2 任务状态定义
1 2 3 4 5 6 7 8
| #define EMPTY 0x00 #define PRE_CANDIDATE 0x01 #define CANDIDATE 0x02 #define EXECUTING 0x03 #define COMPLETED 0x04 #define READY 0x05 #define ERR_STATUS 0x7F
|
3.3 任务插入流程
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
|
/* Insert a parsed NVMe command into the eMMC #0 firmware task queue.
 * Returns 0 on success, 2 when no EMPTY slot was found (queue full). */
int insert_emmc0_task_queue(byte queue_identifier, command_p command, byte bwrite) { int i = 0;
/* Linear scan for the first EMPTY slot; fill it and mark PRE_CANDIDATE. */
for (i = 0; i < global_cq_task_table_size[0]; i++) { if (emmc_firmware_task_status_flag[0][i] == EMPTY) { set_cmd_info(queue_identifier, command, bwrite, &emmc_firmware_task_queue[0][i], 0);
/* Record the slot in the pending-status map (FIFO order of arrival). */
global_emmc_firmware_task_status_map[0][global_emmc_firmware_task_status_map_index[0]] = i; global_emmc_firmware_task_status_map_index[0]++; global_empty_task_num[0]--; emmc_firmware_task_status_flag[0][i] = PRE_CANDIDATE;
/* Backpressure: fewer than 3 free slots left -> stop HW command fetch. */
if (global_empty_task_num[0] < 3) { stop_nvme_command_fetch(1); } break; } } if (i == global_cq_task_table_size[0]) { return 2; } return 0; }
|
4. eMMC Command Queuing 实现
4.1 CQ 模式主流程
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
|
/* Main CQ-mode pump: promote PRE_CANDIDATE tasks into the eMMC command
 * queue (CMD44/45), query readiness, and kick execution of ready tasks.
 * Returns 0 on success or the first CMD44/45 error code. */
int emmc_read_write_cq_mode() { int ret = 0; u8 pre_candidate_id = 0; u8 card_slot = 0; u8 i = 0;
/* Phase 1: if no dataset-management command is pending, push every queued
 * PRE_CANDIDATE into the eMMC CQ. NOTE(review): only slot 0 is handled
 * here (card_num 0 or 2); the eMMC #1 path is presumably elsewhere. */
if (0 == card_num || 2 == card_num) { card_slot = 0; if (0 == global_dataset_management_commands.command_store_flag) { for (i = 0; i < global_emmc_firmware_task_status_map_index[card_slot]; i++) { pre_candidate_id = global_emmc_firmware_task_status_map[card_slot][i]; global_candidate_task_num_postive[card_slot]++; ret = add_task_to_cq(card_slot, pre_candidate_id); if (ret) goto exit;
/* Mark the task in the hardware candidate bitmap and advance its state. */
EMMC0_CANDIDATE_TASK_STATUS_SET = 1 << pre_candidate_id; emmc_firmware_task_status_flag[card_slot][pre_candidate_id] = CANDIDATE; } global_emmc_firmware_task_status_map_index[card_slot] = 0; } }
/* Phase 2: no task currently flagged ready -> poll QSR and pick the next
 * task sequence to execute (guarded by the while1-reentrancy flag). */
if ((0 == card_info.emmc_task_status[1][0]) && (0 == card_info.emmc_task_status[1][1])) { global_inquery_task_status_at_while1 = 1; emmc_task_sequence_select_for_execute(card_slot); global_inquery_task_status_at_while1 = 0; }
/* Phase 3: previous transfer finished -> launch the next READY task. */
if (global_invoking_task_executing_at_while1 && emmc_firmware_executing_task_completion) { ret = execute_ready_task(); }
exit: return ret; }
|
4.2 任务添加到 eMMC CQ
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
/* Register one firmware task with the eMMC command queue: CMD44 sets the
 * task parameters (length, id, direction), CMD45 sets the start address.
 * On command error, attempts in-place recovery and retries once recovery
 * succeeds. NOTE(review): this snippet is illustrative — the '...' in the
 * emmc_error_recovery() call elides real arguments. */
| int add_task_to_cq(u8 card_slot, u8 task_id) { int ret = 0; u32 intr_status;
/* CMD44: '!direction' converts the firmware direction flag to the eMMC
 * read/write encoding — TODO confirm the polarity against the JEDEC spec. */
retry: ret = cmd44_cq_set_task_params( card_slot, emmc_firmware_task_queue[card_slot][task_id].block_len, task_id, 0, !emmc_firmware_task_queue[card_slot][task_id].direction ); if (ret) { LOG(ERROR, "CMD44 failed: 0x%X\n", ret); goto recovery; }
/* CMD45: falls through to 'recovery:' on failure (label is immediately
 * below), so the missing goto here is harmless but inconsistent. */
ret = cmd45_cq_set_task_address( card_slot, (u32)emmc_firmware_task_queue[card_slot][task_id].address ); if (ret) { LOG(ERROR, "CMD45 failed: 0x%X\n", ret); }
/* Recovery: only error classes matching mask 0x090F0000 are recoverable;
 * a successful recovery re-issues CMD44/45 from 'retry'. NOTE(review):
 * 'intr_status = Card_Error' reads like placeholder pseudo-code. */
recovery: if (ret) { intr_status = Card_Error; if (intr_status & 0x090F0000) { card_info.CQ_cmd_recovery_result = 1; ret = emmc_error_recovery(intr_status, card_slot, ..., 1, task_id); if (ret == 0) { goto retry; } } } return ret; }
|
4.3 任务就绪状态查询
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
/* Read the eMMC Queue Status Register via CMD13 (argument 0x00008000
 * selects QSR mode). Skips the query when no tasks are outstanding
 * (positive == negative candidate counters). Returns 0 on success;
 * on CMD13 failure resets the host command line and propagates the error. */
| int get_qsr(u8 card_slot, u32* qsr) { int ret = 0; u32 argument = 0x00008000;
if (global_candidate_task_num_postive[card_slot] == global_candidate_task_num_nagtive[card_slot]) { return ret; } ret = cmd13_card_send_status(argument, card_slot); if (ret) { LOG(ERROR, "CMD13 failed\n"); host_software_reset(reset_cmd_line); return ret; }
/* Assemble the big-endian 4-byte response into the 32-bit QSR bitmap. */
*qsr = (card_info.response_buf[0] << 24 | card_info.response_buf[1] << 16 | card_info.response_buf[2] << 8 | card_info.response_buf[3]);
return ret; }
/* Refresh the cached ready-task bitmap for 'card_slot' from the QSR.
 * Skips the query while previously reported ready tasks are still
 * unconsumed (emmc_task_status[slot][0] != 0). */
int get_task_ready_status_update(u8 card_slot) { u32 temp_qsr = 0; int ret = 0;
if (0 != card_info.emmc_task_status[card_slot][0]) return ret;
ret = get_qsr(card_slot, &temp_qsr); if (ret) return ret;
/* Mask the QSR against the hardware candidate bitmap, cache the result,
 * then clear those bits from the hardware candidate register. */
card_info.emmc_task_status[card_slot][0] = temp_qsr & EMMC_CANDIDATE_TASK_STATUS_REG(card_slot); EMMC_CANDIDATE_TASK_STATUS_CLEAR(card_slot) = card_info.emmc_task_status[card_slot][0];
return ret; }
|
4.4 执行数据传输
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
/* Start the data phase for the currently selected ready task: program the
 * DMA engine with the task's PRP pair, then issue CMD47 (write) or CMD46
 * (read) with the task id in bits [20:16] of the argument. */
| int execute_select_task(u8 card_slot) { int ret = 0; task_info_struct *task = &emmc_firmware_task_queue[card_slot][emmc_firmware_executing_task_id];
/* Load the 64-bit PRP1/PRP2 host addresses into the transfer registers. */
BAYBRIDGE_DATA_TRANSFER_CTRL_1 = task->prp1 & 0xFFFFFFFF; BAYBRIDGE_DATA_TRANSFER_CTRL_2 = (task->prp1 >> 32) & 0xFFFFFFFF; BAYBRIDGE_DATA_TRANSFER_CTRL_3 = task->prp2 & 0xFFFFFFFF; BAYBRIDGE_DATA_TRANSFER_CTRL_4 = (task->prp2 >> 32) & 0xFFFFFFFF;
/* direction != 0 -> host-to-card write via CMD47; direction == 0 ->
 * card-to-host read via CMD46 with the to-system-memory DMA flag set.
 * block_len is in 512-byte sectors, hence the * 512 byte counts. */
if (task->direction) { BAYBRIDGE_DATA_TRANSFER_CTRL_5 = task->block_len * 512 | START_DATA_TRANSFER_BIT; ret = cmd47_cq_execute_write_task( card_slot, ((u32)emmc_firmware_executing_task_id) << 16, 0, task->block_len * 512 ); } else { BAYBRIDGE_DATA_TRANSFER_CTRL_5 = task->block_len * 512 | DATA_TRANSFER_DIRECTION_WRITE_TO_SYSTEM_MEMORY | START_DATA_TRANSFER_BIT; ret = cmd46_cq_execute_read_task( card_slot, ((u32)emmc_firmware_executing_task_id) << 16, 0, task->block_len * 512 ); }
/* NOTE(review): recovery arguments are elided ('...') in this snippet. */
if (ret) { card_info.CQ_cmd_recovery_result = 2; ret = emmc_error_recovery(...); }
return ret; }
|
5. 跨 eMMC 边界处理(Fused Command)
5.1 双 eMMC 容量映射
当系统配置为双 eMMC 组成单一 Namespace 时:
1 2 3 4 5 6 7 8 9 10 11 12
| Host 视角 (单一 Namespace) ┌──────────────────────────────────────────────────────────────┐ │ Total Capacity = eMMC0 + eMMC1 │ │ LBA 0 ────────────────────────────────────► LBA MAX │ └──────────────────────────────────────────────────────────────┘ │ ▼ Firmware 映射 (内部视角) ┌────────────────────────────┬─────────────────────────────────┐ │ eMMC #0 │ eMMC #1 │ │ LBA 0 ────► eMMC0_MAX │ LBA 0 ────► eMMC1_MAX │ └────────────────────────────┴─────────────────────────────────┘
|
5.2 Fused Command 处理
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
|
/* Route an NVMe read/write to the right eMMC task queue. In the combined
 * double-eMMC namespace, LBAs below emmc_max_sect[0] go to eMMC #0, LBAs
 * above go to eMMC #1 (rebased by subtracting eMMC #0's capacity), and
 * requests that straddle the boundary become a fused cross-boundary task.
 * NVMe DW12 holds blocks-1, hence the +1. */
int read_write_process(byte queue_identifier, command_p command, byte bwrite) { u64 address = command->rw_dw10.starting_lba_low | (u64)command->rw_dw11.starting_lba_high << 32; u16 block_len = command->rw_dw12.number_of_logical_blocks + 1;
/* NOTE(review): when case_id != DOUBLE_EMMC_TOTAL_NS this function falls
 * off the end without returning a value (UB if the caller uses it) —
 * presumably the single-eMMC branch was elided from this excerpt; confirm.
 * Also: rebasing eMMC #1 addresses writes only starting_lba_low; verify
 * starting_lba_high is adjusted (or provably zero) for >4G-sector LBAs. */
if (ns_emmc_map.case_id == DOUBLE_EMMC_TOTAL_NS) { if (address < card_info.emmc_max_sect[0] && card_info.emmc_max_sect[0] < (address + block_len)) { return insert_cross_boundary_task_to_task_queue(queue_identifier, command, bwrite); } else if (address < card_info.emmc_max_sect[0]) { return insert_emmc0_task_queue(queue_identifier, command, bwrite); } else { command->rw_dw10.starting_lba_low = address - card_info.emmc_max_sect[0]; return insert_emmc1_task_queue(queue_identifier, command, bwrite); } } }
/* Split a boundary-straddling request into a FUSED pair: the eMMC #0 half
 * covers [address, emmc_max_sect[0]) and the eMMC #1 half covers the
 * remaining blocks starting at eMMC #1 LBA 0. The halves point at each
 * other via next_pointer. Returns 0 on success, 2 when either queue has
 * no free slot. */
int insert_cross_boundary_task_to_task_queue(byte queue_identifier, command_p command, byte bwrite) { u64 address = command->rw_dw10.starting_lba_low | (u64)command->rw_dw11.starting_lba_high << 32; u16 block_len = command->rw_dw12.number_of_logical_blocks + 1; u16 emmc0_block_len = card_info.emmc_max_sect[0] - address; u16 emmc1_block_len = block_len - emmc0_block_len; int task0_id = find_empty_slot(0); int task1_id = find_empty_slot(1); if (task0_id < 0 || task1_id < 0) return 2; emmc_firmware_task_queue[0][task0_id].prp1 = command->prp.prp_entry1; emmc_firmware_task_queue[0][task0_id].prp2 = command->prp.prp_entry2; emmc_firmware_task_queue[0][task0_id].address = address; emmc_firmware_task_queue[0][task0_id].block_len = emmc0_block_len; emmc_firmware_task_queue[0][task0_id].fused_flag = FUSED_NEW; emmc_firmware_task_queue[0][task0_id].next_pointer = task1_id; emmc_firmware_task_queue[0][task0_id].start_card = 0;
/* The eMMC #1 half starts at its LBA 0; prp1/prp2 are left 0 here —
 * presumably the data pointers for the second half are derived at DMA
 * time from the first half's PRPs plus emmc0_block_len — TODO confirm. */
emmc_firmware_task_queue[1][task1_id].prp1 = 0; emmc_firmware_task_queue[1][task1_id].prp2 = 0; emmc_firmware_task_queue[1][task1_id].address = 0; emmc_firmware_task_queue[1][task1_id].block_len = emmc1_block_len; emmc_firmware_task_queue[1][task1_id].fused_flag = FUSED_NEW; emmc_firmware_task_queue[1][task1_id].next_pointer = task0_id; emmc_firmware_task_queue[1][task1_id].start_card = 0;
return 0; }
|
5.3 Fused 任务执行同步
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
| void get_fused_task_ready_status(void) { u8 i = 0; u8 temp = 0; for (i = 0; i < global_cq_task_table_size[0]; i++) { if (FUSED_NEW == emmc_firmware_task_queue[0][i].fused_flag) { temp = emmc_firmware_task_queue[0][i].next_pointer; if (IS_TASK_READY(0, i) && IS_TASK_READY(1, temp)) { emmc_firmware_task_queue[0][i].fused_flag = FUSED_READY; emmc_firmware_task_queue[1][temp].fused_flag = FUSED_READY; card_info.emmc_task_status[1][0] &= ~(u32)(1 << temp); } else { card_info.emmc_task_status[0][0] &= ~(u32)(1 << i); card_info.emmc_task_status[1][0] &= ~(u32)(1 << temp); EMMC0_CANDIDATE_TASK_STATUS_SET |= (u32)(1 << i); EMMC1_CANDIDATE_TASK_STATUS_SET |= (u32)(1 << temp); } } } }
|
6. 中断与 Completion 优化
6.1 Interrupt Coalescing
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
/* Post an NVMe completion and raise (or coalesce) the MSI interrupt.
 * Successful IO-queue completions are aggregated: the interrupt fires only
 * every 'global_aggregation_threshold' completions. Errors and admin-queue
 * (queue 0) completions always interrupt immediately.
 * NOTE(review): snippet elides the per-vector index ('[...]') and the
 * cq_id derivation; illustrative only. */
| void send_complete(u16 command_identifier, byte queue_identifier, u32 spec, u16 status) {
if (status == 0 && queue_identifier != 0) { if (global_aggregation_threshold == 0) { INTERRUPT_REQUEST_STATUS_SET = 1 << global_completion_queue_interrupt_vector[cq_id]; } else { global_msi_interrupt_count[...]++; if (global_msi_interrupt_count[...] >= global_aggregation_threshold) { send_msi_interrupt(0); } } } else { INTERRUPT_REQUEST_STATUS_SET = 1 << global_completion_queue_interrupt_vector[cq_id]; } }
|
6.2 批量 Completion 发送
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
/* Batch-drain finished tasks on both cards: for every COMPLETED non-fused
 * slot, post its NVMe completion and free the slot; ERR_STATUS slots are
 * completed with a generic error status (0x82). */
| void send_completion_task(void) { byte i = 0, j = 0;
/* NOTE(review): the ERR_STATUS branch frees the slot but does not bump
 * global_complete_task_num_nagtive[j] as the success branch does — either
 * intentional (errors counted elsewhere) or an elision in this excerpt;
 * confirm the counters cannot drift. The send_complete args are elided. */
for (j = 0; j < 2; j++) { if (global_complete_task_num_postive[j] != global_complete_task_num_nagtive[j]) { for (i = 0; i < global_cq_task_table_size[j]; i++) { if (emmc_firmware_task_status_flag[j][i] == COMPLETED) { if (emmc_firmware_task_queue[j][i].fused_flag == FUSED_EMPTY) { send_complete(emmc_firmware_task_queue[j][i].nvme_cmd_id, emmc_firmware_task_queue[j][i].nvme_sq_id, 0, 0); emmc_firmware_task_status_flag[j][i] = EMPTY; global_empty_task_num[j]++; global_complete_task_num_nagtive[j]++; } } else if (emmc_firmware_task_status_flag[j][i] == ERR_STATUS) { send_complete(..., SCT_GENERIC_COMMAND_STATUS | 0x82); emmc_firmware_task_status_flag[j][i] = EMPTY; global_empty_task_num[j]++; } } } }
/* Release backpressure once more than 7 slots are free again (hysteresis
 * against the '< 3' stop threshold in the insert path). */
if (global_empty_task_num[card_num] > 7) { stop_nvme_command_fetch(0); } }
|
7. 流量控制(背压机制)
7.1 命令获取控制
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
| void stop_nvme_command_fetch(byte stop) { switch (global_created_io_sq_num) { case 1: if (stop) { STOP_CMD_FETCH |= (HW_DOES_NOT_FETCH_A_COMMAND << SQ_1_SHIFT); } else { STOP_CMD_FETCH &= ~(HW_DOES_NOT_FETCH_A_COMMAND << SQ_1_SHIFT); } break; case 2: if (stop) { STOP_CMD_FETCH |= (HW_DOES_NOT_FETCH_A_COMMAND << SQ_1_SHIFT | HW_DOES_NOT_FETCH_A_COMMAND << SQ_2_SHIFT); } else { STOP_CMD_FETCH &= ~(...); } break; } }
|
7.2 背压触发条件
1 2 3 4 5 6 7 8 9
/* Illustration only: the two backpressure trigger points excerpted from the
 * insert path (stop fetching below 3 free slots) and the completion path
 * (resume fetching above 7 free slots); the 3/7 gap provides hysteresis. */
| if (global_empty_task_num[0] < 3) { stop_nvme_command_fetch(1); }
if (global_empty_task_num[card_num] > 7) { stop_nvme_command_fetch(0); }
|
8. 性能优化总结
| 优化技术 |
实现方式 |
性能提升 |
| 双 Buffer |
Ping-Pong CMD Buffer |
减少命令获取等待 |
| Command Queuing |
eMMC CQ (32 depth) |
提高随机 IOPS |
| 硬件位图 |
任务状态位图加速 |
减少遍历开销 |
| Fused Command |
跨 eMMC 边界并行 |
双 eMMC 带宽叠加 |
| Interrupt Coalescing |
批量中断发送 |
降低 CPU 中断负载 |
| 背压控制 |
动态流量控制 |
避免任务队列溢出 |
典型性能指标:
- 顺序读:~300 MB/s (HS400)
- 顺序写:~250 MB/s
- 随机读 4K QD32:~25K IOPS
- 随机写 4K QD32:~20K IOPS
9. 小数据块命令合并优化(1KB Write 场景)
9.1 问题背景
在特定 NAS 系统安装场景中,Linux 拷贝系统下发的 NVMe IO 命令具有以下特点:
| 问题特征 |
说明 |
| NVMe 命令粒度 |
每条命令仅传输 1KB 数据(尽管 NVMe 最小支持 4KB Buffer) |
| eMMC LBA 连续 |
多条 1KB 命令对应的 eMMC 逻辑地址是连续的 |
| CQ FIFO 阻塞 |
eMMC Command Queue 深度为 32,每条 1KB 命令占用一个槽位 |
| 性能瓶颈 |
写入速度仅 ~20 MB/s,远低于 eMMC HS400 带宽 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
| 问题场景时序(优化前) ┌──────────────────────────────────────────────────────────────────────────┐ │ NVMe Host BayBridge eMMC │ │ │ │ │ │ │ │─── 1KB Write (LBA=0) ───────►│ │ │ │ │─── 1KB Write (LBA=2) ───────►│ │ │ │ │─── 1KB Write (LBA=4) ───────►│── CMD44/45 (1KB) ───────►│ Slot 0 │ │ │─── 1KB Write (LBA=6) ───────►│── CMD44/45 (1KB) ───────►│ Slot 1 │ │ │ ... │ ... │ ... │ │ │─── 1KB Write (LBA=62) ──────►│── CMD44/45 (1KB) ───────►│ Slot 31 │ │ │ │ │ │ │ │ ╔════════════════════════════════════════════════════════════╗ │ │ │ ║ eMMC CQ 32 FIFO 已满!等待任务完成才能接收新命令 ║ │ │ │ ║ 每个 1KB 任务的 eMMC 命令开销 >> 数据传输时间 ║ │ │ │ ╚════════════════════════════════════════════════════════════╝ │ │ │ │ │ │ │ │─── 1KB Write (LBA=64) ──────►│ BLOCKED 等待... │ │ └──────────────────────────────────────────────────────────────────────────┘
|
9.2 优化方案架构
核心思想:软件 Fetch NVMe 命令,检测 eMMC LBA 连续性,合并多个 1KB 任务的数据到单个 eMMC Command Queue 任务。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
| 优化方案架构 ┌────────────────────────────────────────────────────────────────────────────┐ │ NVMe Command Merge Pipeline │ │ │ │ ┌──────────────────┐ ┌───────────────────────────────────────────┐ │ │ │ NVMe Host │ │ BayBridge Firmware (SW Fetch Mode) │ │ │ │ │ │ │ │ │ │ CMD[0]: 1KB LBA=0│─────►│ ┌─────────────────────────────────────┐ │ │ │ │ CMD[1]: 1KB LBA=2│─────►│ │ Merge Task List Manager │ │ │ │ │ CMD[2]: 1KB LBA=4│─────►│ │ ┌─────────────────────────────┐ │ │ │ │ │ CMD[3]: 1KB LBA=6│─────►│ │ │ check_new_task_node() │ │ │ │ │ │ │ │ │ │ - LBA 连续性检测 │ │ │ │ │ └──────────────────┘ │ │ │ - 合并阈值控制 (≤4 tasks) │ │ │ │ │ │ │ │ - Block 总数限制 (<1024) │ │ │ │ │ │ │ └─────────────────────────────┘ │ │ │ │ │ │ │ │ │ │ │ │ │ ┌───────────▼───────────┐ │ │ │ │ │ │ │ merge_task_list[32] │ │ │ │ │ │ │ │ ┌───────────────────┐│ │ │ │ │ │ │ │ │ head_index: 0 ││ │ │ │ │ │ │ │ │ end_index: 3 ││ │ │ │ │ │ │ │ │ total_blk_cnt: 8 ││← 4×1KB │ │ │ │ │ │ │ │ total_node_cnt: 4 ││ │ │ │ │ │ │ │ └───────────────────┘│ │ │ │ │ │ │ └───────────┬───────────┘ │ │ │ │ │ └─────────────────│─────────────────────┘ │ │ │ │ │ │ │ CMD44/45 (合并后 4KB) │ │ │ ▼ │ │ ┌──────────────────┐ │ ┌─────────────────────────────────────┐ │ │ │ eMMC CQ Slot 0 │◄─────│──│ 单个 eMMC 任务 = 4 个 NVMe 命令 │ │ │ │ (4KB = 4×1KB) │ │ └─────────────────────────────────────┘ │ │ └──────────────────┘ └────────────────────────────────────────────┘│ │ │ │ 优化效果:eMMC CQ 利用率提升 4 倍,写速度从 ~20MB/s 提升到 ~60MB/s │ └────────────────────────────────────────────────────────────────────────────┘
|
9.3 数据结构设计
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
/* One merge list: a run of LBA-contiguous small tasks folded into a single
 * eMMC CQ task. head_index/end_index bound the run in the task array;
 * freeze blocks further merging once CMD44 has been issued;
 * begin/end_blk_addr and total_blk_cnt describe the merged extent. */
| typedef struct { boolean freeze; u8 head_index; u8 end_index; u8 total_node_cnt; u32 begin_blk_addr; u32 total_blk_cnt; u32 end_blk_addr; } merge_task_list_t;
/* Per-card merge state: 32 merge lists per card, a hash marking which task
 * ids are list heads, the previously inserted task index (for contiguity
 * checks), plus the executing list head and round-robin search cursor. */
typedef struct { merge_task_list_t merge_task_list[2][32]; u8 merge_head_hash[2][32]; u8 pre_task_index[2]; u8 executing_list_head; u8 ready_task_search_begin; } card_info_struct;
/* Per-task link fields: next/pre chain tasks in arrival order, fused_next
 * links fused cross-card pairs, freeze blocks merging into this node. */
typedef struct { u8 next; u8 pre; u8 fused_next; boolean freeze; } task_info_struct;
|
9.4 核心算法实现
9.4.1 任务连续性检测
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
/* Classification of a newly inserted task relative to the merge list. */
| #define ISOLATE_TASK 0 #define MERGE_LIST_NODE 1
/* Decide whether task 'index' can join the current merge list at 'head'.
 * Returns MERGE_LIST_NODE only when: the list (or the previous task, if no
 * list exists yet) is not frozen, the list has fewer than
 * BB_FETCH_ENTRY_NUMBER nodes' worth of headroom, the new task's start
 * address equals the list's (or previous task's) end address, and the
 * merged total stays under 1024 blocks. Everything else is ISOLATE_TASK.
 * pre_task_index == (u8)-1 means "no previous task" (first insertion). */
static int check_new_task_node(u8 card_slot, u8 index, u8 head) { if (card_info.pre_task_index[card_slot] == (u8)-1) { return ISOLATE_TASK; } if (card_info.merge_task_list[card_slot][head].total_blk_cnt != 0) { if (card_info.merge_task_list[card_slot][head].freeze == TRUE) { return ISOLATE_TASK; } if (card_info.merge_task_list[card_slot][head].total_blk_cnt > BB_FETCH_ENTRY_NUMBER - 1) { return ISOLATE_TASK; } if ((card_info.merge_task_list[card_slot][head].end_blk_addr == (u32)emmc_firmware_task_queue[card_slot][index].address) && (card_info.merge_task_list[card_slot][head].total_blk_cnt + emmc_firmware_task_queue[card_slot][index].block_len < 1024)) { return MERGE_LIST_NODE; } return ISOLATE_TASK; } if (emmc_firmware_task_queue[card_slot][card_info.pre_task_index[card_slot]].freeze == TRUE) { return ISOLATE_TASK; } u8 pre_idx = card_info.pre_task_index[card_slot]; u32 pre_end_addr = (u32)emmc_firmware_task_queue[card_slot][pre_idx].address + emmc_firmware_task_queue[card_slot][pre_idx].block_len; u32 cur_addr = (u32)emmc_firmware_task_queue[card_slot][index].address; u16 total_len = emmc_firmware_task_queue[card_slot][index].block_len + emmc_firmware_task_queue[card_slot][pre_idx].block_len; if ((pre_end_addr == cur_addr) && (total_len < 1024)) { return MERGE_LIST_NODE; } return ISOLATE_TASK; }
|
9.4.2 合并链表管理
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
| boolean try_insert_merge_task_list(u8 card_slot, u8 index) { boolean should_skip_task_node; u8 head = get_first_unfreeze_tasklist(card_slot, index); u8 new_node_type = check_new_task_node(card_slot, index, head); if (new_node_type == ISOLATE_TASK) { emmc_firmware_task_queue[card_slot][index].pre = card_info.pre_task_index[card_slot]; emmc_firmware_task_queue[card_slot][card_info.pre_task_index[card_slot]].next = index; card_info.pre_task_index[card_slot] = index; should_skip_task_node = FALSE; } else { emmc_firmware_task_queue[card_slot][index].pre = card_info.pre_task_index[card_slot]; emmc_firmware_task_queue[card_slot][card_info.pre_task_index[card_slot]].next = index; if (card_info.merge_task_list[card_slot][head].total_blk_cnt == 0) { u8 pre_idx = card_info.pre_task_index[card_slot]; card_info.merge_task_list[card_slot][pre_idx].head_index = pre_idx; card_info.merge_task_list[card_slot][pre_idx].end_index = pre_idx; card_info.merge_task_list[card_slot][pre_idx].total_node_cnt = 1; card_info.merge_task_list[card_slot][pre_idx].begin_blk_addr = (u32)emmc_firmware_task_queue[card_slot][pre_idx].address; card_info.merge_task_list[card_slot][pre_idx].total_blk_cnt = emmc_firmware_task_queue[card_slot][pre_idx].block_len; card_info.merge_head_hash[card_slot][pre_idx] = 1; head = pre_idx; } card_info.merge_task_list[card_slot][head].end_index = index; card_info.merge_task_list[card_slot][head].total_node_cnt++; card_info.merge_task_list[card_slot][head].total_blk_cnt += emmc_firmware_task_queue[card_slot][index].block_len; card_info.merge_task_list[card_slot][head].end_blk_addr = card_info.merge_task_list[card_slot][head].begin_blk_addr + card_info.merge_task_list[card_slot][head].total_blk_cnt; card_info.pre_task_index[card_slot] = index; should_skip_task_node = TRUE; } return should_skip_task_node; }
|
9.4.3 合并任务的 CMD44 下发
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
| int add_task_to_cq(u8 card_slot, u8 task_id) { int ret = 0; u16 block_len; if (card_info.merge_head_hash[card_slot][task_id] == 1) { block_len = card_info.merge_task_list[card_slot][task_id].total_blk_cnt; card_info.merge_task_list[card_slot][task_id].freeze = TRUE; u8 end_idx = card_info.merge_task_list[card_slot][task_id].end_index; emmc_firmware_task_queue[card_slot][end_idx].freeze = TRUE; } else { block_len = emmc_firmware_task_queue[card_slot][task_id].block_len; emmc_firmware_task_queue[card_slot][task_id].freeze = TRUE; } ret = cmd44_cq_set_task_params(card_slot, block_len, task_id, 0, !emmc_firmware_task_queue[card_slot][task_id].direction); if (ret) goto recovery; ret = cmd45_cq_set_task_address(card_slot, (u32)emmc_firmware_task_queue[card_slot][task_id].address); }
|
9.4.4 分段 NVMe DMA 传输
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
| void BB_Internal_Interrupt(int32 vec) { if (NVMe_DMA_Complete) { u8 card_slot = ; u8 head = card_info.executing_list_head; if (emmc_firmware_executing_task_id == card_info.merge_task_list[card_slot][head].end_index) { emmc_firmware_executing_task_id = card_info.merge_task_list[card_slot][head].head_index; BAYBRIDGE_INTERNAL_INTERRUPT_SIGNALING_MASK |= BIT4; SD_NORMAL_INTERRUPT_SIGNAL_ENABLE |= BIT1; } else { emmc_firmware_executing_task_id = emmc_firmware_task_queue[card_slot][emmc_firmware_executing_task_id].next; start_nvme_dma_for_next_merged_task(card_slot, emmc_firmware_executing_task_id); } } }
|
9.5 执行时序
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
| 优化后时序 ┌──────────────────────────────────────────────────────────────────────────────┐ │ NVMe Host BayBridge FW eMMC │ │ │ │ │ │ │ CMD[0] 1KB LBA=0 ───────►│ │ │ │ CMD[1] 1KB LBA=2 ───────►│ ← check_new_task_node() │ │ CMD[2] 1KB LBA=4 ───────►│ LBA 连续 → 合并 │ │ CMD[3] 1KB LBA=6 ───────►│ │ │ │ │ │ │ │ │ │ │── CMD44 (4KB) ──────►│ Slot 0 │ │ │ │── CMD45 (LBA=0) ────►│ (合并 4 个任务) │ │ │ │ │ │ │ │ │ │ │ │ │ NVMe DMA │◄── CMD46 Ready ─────│ │ │ │ [0] ─│─────────────────────►│ │ │ │ NVMe DMA │ │ │ │ │ [1] ─│─────────────────────►│ eMMC DMA │ │ │ NVMe DMA │ │ (持续传输) │ │ │ [2] ─│─────────────────────►│ │ │ │ NVMe DMA │ │ │ │ │ [3] ─│─────────────────────►│ │ │ │ │ │ │ │ │ │◄── eMMC DMA Done ───│ │ │ │ │ │ │ │ ◄─── CQ[0] Complete ────│ (合并后仅发送 1 个 │ │ │ ◄─── CQ[1] Complete ────│ eMMC 命令,4 个 │ │ │ ◄─── CQ[2] Complete ────│ NVMe Completion) │ │ │ ◄─── CQ[3] Complete ────│ │ │ │ │ │ │ │ └──────────────────────────────────────────────────────────────────────────────┘
性能对比: ┌─────────────────────┬──────────────┬──────────────┬─────────────┐ │ 场景 │ 优化前 │ 优化后 │ 提升幅度 │ ├─────────────────────┼──────────────┼──────────────┼─────────────┤ │ eMMC CQ 命令数 │ 32 (每个1KB) │ 8 (每个4KB) │ 4× │ │ 命令开销时间 │ 高 │ 低 │ ↓ │ │ 写入速度 │ ~20 MB/s │ ~60 MB/s │ 3× │ └─────────────────────┴──────────────┴──────────────┴─────────────┘
|
9.6 关键优化点总结
| 优化点 |
实现方式 |
作用 |
| 软件 Fetch |
修改硬件 NVMe Fetch 流程为软件控制 |
支持命令解析和合并判断 |
| LBA 连续检测 |
check_new_task_node() 检测前后任务地址 |
识别可合并的命令序列 |
| 合并链表 |
merge_task_list[32] 双向链表 |
管理合并后的任务节点 |
| 任务冻结 |
freeze 标志位 |
防止 CMD44 下发后继续合并 |
| 分段 NVMe DMA |
中断驱动的逐节点 DMA |
保证 NVMe 数据正确传输 |
| 合并阈值 |
BB_FETCH_ENTRY_NUMBER - 1 |
避免 NVMe 命令获取阻塞 |
| 轮询搜索 |
ready_task_search_begin |
避免任务饥饿 |
10. 附录:关键配置参数
1 2 3 4 5 6 7 8 9 10
/* Queue depths (0x20 = 32 slots, CQ and non-CQ modes) and the merge
 * fetch-entry limit used by check_new_task_node's headroom test. */
| #define BB_CQ_DEPTH_DEFINE 0x20 #define BB_NON_CQ_DEPTH_DEFINE 0x20 #define BB_FETCH_ENTRY_NUMBER 4 #define BB_MAX_TRANSFER_MODE 0
/* Buffer sizing: 4 KB data/PRP buffers, 1 KB command buffer. */
#define DATA_BUFFER_SIZE (4*1024) #define CMD_BUFFER_SIZE (1024) #define PRP_SIZE (4*1024)
|