征程 5 VPS panic问题分析

地平线开发者 | 2024-11-28 22:18:11 阅读：53

01 背景

小概率出现 kernel 异常重启问题：3 个月内出现了 3 例因 Linux kernel 访问异常地址而导致重启的问题。通过场景分析构造了测试场景，并打开 ramdump 配置进行复测，最终抓到了问题现场。

02 crash 解析 ramdump

crash ./vmlinux /dev/random@0x80000000,DDRCS0-1.bin@0x80970000,DDRCS0-2.bin@0x100970000,DDRCS0-3.bin@0x140970000,DDRCS0-4.bin@0x180970000,DDRCS0-5.bin@0x1c0970000,DDRCS0-6.bin@0x200970000,DDRCS0-7.bin@0x240970000 --machdep vabits_actual=48

2.1 查看出错 log

通过 dmesg 命令抓取 log 缓存区，找到出错的日志和调用栈：


crash> dmesg
...
[ 1134.509848] ==================================================================
[ 1134.509888] BUG: KASAN: user-memory-access in cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.510057] Write of size 1 at addr 0000050100000780 by task ipi6_thread/4052
[ 1134.510080]
[ 1134.510092] CPU: 1 PID: 4052 Comm: ipi6_thread Tainted: P           O      5.10.59-rt52-gbdf2977878dd-dirty #2
[ 1134.510119] Hardware name: Horizon AI Technologies, Inc. HOBOT j5 RHODE B2 & C & Ca & Cb & Cc & Cd & Ce (DT)
[ 1134.510138] Call trace:
[ 1134.510147]  dump_backtrace+0x0/0x2e0
[ 1134.510192]  show_stack+0x14/0x20
[ 1134.510224]  dump_stack+0xf8/0x160
[ 1134.510256]  kasan_report+0x1a8/0x200
[ 1134.510284]  __asan_store1+0x9c/0xa8
[ 1134.510308]  cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.510443]  kthread+0x258/0x260
[ 1134.510475]  ret_from_fork+0x10/0x1c
[ 1134.510504] ==================================================================
...

[ 1134.690628] CPU: 6 PID: 4052 Comm: ipi6_thread Tainted: P    B      O      5.10.59-rt52-gbdf2977878dd-dirty #2
[ 1134.693905] Hardware name: Horizon AI Technologies, Inc. HOBOT j5 RHODE B2 & C & Ca & Cb & Cc & Cd & Ce (DT)
[ 1134.695165] pstate: 40c00005 (nZcv daif +PAN +UAO -TCO BTYPE=--)
[ 1134.695951] pc : cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.696841] lr : cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.697726] sp : ffff0001c0e5fc50
[ 1134.698163] pmr_save: 000000e0
[ 1134.698566] x29: ffff0001c0e5fc50 x28: 0000000000000000
[ 1134.699273] x27: 0000000000000000 x26: 0000000000000000
[ 1134.699977] x25: 000000000000004c x24: 0000050100000780
[ 1134.700682] x23: ffff000180e0b3c0 x22: ffff00020d4c0a50
[ 1134.701390] x21: ffff000180e0b1b0 x20: ffff000180e0b248
[ 1134.702099] x19: ffff000180e0b120 x18: 0000000000000000
[ 1134.702805] x17: 0000000000000000 x16: 0000000000000000
[ 1134.703509] x15: 0000000000000000 x14: 3d3d3d3d3d3d3d3d
[ 1134.704215] x13: 3d3d3d3d3d3d3d3d x12: ffff9400025cf1cf
[ 1134.704923] x11: 1ffff400025cf1ce x10: ffff9400025cf1ce
[ 1134.705631] x9 : dfffa00000000000 x8 : ffffa00012e78e70
[ 1134.706340] x7 : 0000000000000001 x6 : ffffa00012e78e70
[ 1134.707045] x5 : 00006bfffda30e32 x4 : dfffa00000000000
[ 1134.707754] x3 : ffffa00010c3c6a8 x2 : 0000000000000007
[ 1134.708459] x1 : ffff00017d1ec4c0 x0 : 0000000000000001
[ 1134.709166] Call trace:
[ 1134.709497]  cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.710340]  kthread+0x258/0x260
[ 1134.710787]  ret_from_fork+0x10/0x1c
[ 1134.711288] Code: f94033e1 8b190300 387b4839 95d23efc (383c4b19)

通过日志中的信息可知，出错位置是 cimdma_swap_buffer+0x2a4，出错的线程是 ipi6_thread，对应 pipeline 8。

2.2 定位出错代码行

加载 hobot_cim_dma.ko 符号表，并反汇编 cimdma_swap_buffer，找到偏移是 0x2a4（676）的指令行：

//加载符号表
crash> mod -s hobot_cim_dma /home/kaikai.sun/cimdma_ramdump/symbols/kernel/hobot_cim_dma.ko
     MODULE       NAME                       BASE           SIZE  OBJECT FILE
ffffa00008d97d80  hobot_cim_dma        ffffa00008d70000   184320  /home/kaikai.sun/cimdma_ramdump/symbols/kernel/hobot_cim_dma.ko

//反汇编
crash> dis -l cimdma_swap_buffer
/home/ycj/work/adnoa/software/adpro_j5_acore_public_origin/kernel/drivers/media/platform/hobot/cim_dma/hobot_cim_dma_ops.c: 1124
0xffffa00008d724a4:    ldr     x0, [sp, #120]
0xffffa00008d724a8:    bl      0xffffa00010202488 <__asan_load8>
0xffffa00008d724ac:    ldr     x0, [x22, #176]
0xffffa00008d724b0:    str     x0, [sp, #96]
0xffffa00008d724b4:    ldr     x0, [sp, #104]
0xffffa00008d724b8:    bl      0xffffa00010202488 <__asan_load8>
0xffffa00008d724bc:    ldr     x0, [sp, #96]
0xffffa00008d724c0:    ldr     x24, [x23, #168]
0xffffa00008d724c4:    add     x0, x0, x26
0xffffa00008d724c8:    bl      0xffffa00010202020 <__asan_load1>
0xffffa00008d724cc:    ldr     x1, [sp, #96]
0xffffa00008d724d0:    add     x0, x24, x25
0xffffa00008d724d4:    ldrb    w25, [x1, w27, uxtw]
0xffffa00008d724d8:    bl      0xffffa000102020c8 <__asan_store1>
0xffffa00008d724dc:    strb    w25, [x24, w28, uxtw]

出错的指令行是 strb w25, [x24, w28, uxtw]。结合 log 中的寄存器信息，x24: 0000050100000780 与 KASAN 报告的写入地址一致，说明该指令确实是出错点。

再来看 X24 是怎么赋值的：ldr x24, [x23, #168]，即从 X23 + 168 处加载。结合 log 中的寄存器信息和代码 hobot_cim_dma_ops.c:1124：

x23: ffff000180e0b3c0

emb_frame->buffer.info.addr[0][i] = frame->buffer.info.addr[1][i/2];

X23 中保存的地址便是 emb_frame 指针。

2.3 查看出错内存信息

查看 emb_frame 的内存信息：


crash> struct vio_frame 0xffff000180e0b3c0
struct vio_frame {
  list = {
    next = 0xffff00020d538000,
    prev = 0xffff00020d538a50
  },
  work = {
    node = {
      next = 0xffff000180e0b3d0,
      prev = 0xffff000180e0b3d0
    },
    func = 0xffff000180e0b3e0,
    worker = 0xffff000180e0b3e0,
    canceling = -2132757520
  },
  group = 0xffff000180e0b3f0,
  buffer = {
    ion_alloced = 30 '\036',
    ion_cached = 0 '\000',
    ion_cachesync = 0 '\000',
    consecutive_mode = 0 '\000',
    ion_mmap = 128 '\200',
    planeSize = {1281, 0, 1},
    info = {
      index = 0,
      planecount = 1,
      share_id = {0, 0, 0},
      planeSize = {1, 0, 1},
      paddr = {8589934595, 0, 4294967296},
      addr = {0x50100000780, 0x1e, 0x6}
    },
....
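从上面解析出的结构体内容来看，各字段的数值基本都不合理（例如 work.func 的值就是它自身所在的地址 0xffff000180e0b3e0，addr[0] 为出错地址 0x50100000780），说明 X23 指向的并不是一个有效的 vio_frame。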

crash> ptype /o struct vio_frame
/* offset    |  size */  type = struct vio_frame {
/*    0      |    16 */    struct list_head {
/*    0      |     8 */        struct list_head *next;
/*    8      |     8 */        struct list_head *prev;
....

/*  488      |    12 */    u32 paddr_buffer[3];
/*  500      |     4 */    enum vio_frame_state state;
/*  504      |     4 */    u32 instance;
/*  508      |     4 */    u32 fcount;
/*  512      |     4 */    u32 index;
/*  516      |     2 */    u16 dispatch_cnt;
/*  518      |     1 */    u8 iommu_map;
/*  519      |     1 */    u8 remote_buf;
/*  520      |     8 */    void *ext_data;

                           /* total size (bytes):  528 */
                         }

crash> rd ffff000180e0b3c0 -e ffff000180e0b600
ffff000180e0b3c0:  ffff00020d538000 ffff00020d538a50   ..S.....P.S.....
ffff000180e0b3d0:  ffff000180e0b3d0 ffff000180e0b3d0   ................
ffff000180e0b3e0:  ffff000180e0b3e0 ffff000180e0b3e0   ................
ffff000180e0b3f0:  ffff000180e0b3f0 ffff000180e0b3f0   ................
ffff000180e0b400:  000007800000001e 0000000000000501   ................
ffff000180e0b410:  0000000000000000 0000000000000001   ................
ffff000180e0b420:  0000000100000000 0000000000000000   ................
ffff000180e0b430:  0000000100000000 0000000000000001   ................
ffff000180e0b440:  0000000000000000 0000000000000001   ................
ffff000180e0b450:  0000000200000003 0000000000000000   ................
ffff000180e0b460:  0000000100000000 0000050100000780   ................
ffff000180e0b470:  000000000000001e 0000000000000006   ................
ffff000180e0b480:  0000000000000000 0000000000000001   ................
ffff000180e0b490:  0000000100000001 0000000100000780   ................
ffff000180e0b4a0:  0000000000000000 0000000000000000   ................
ffff000180e0b4b0:  0000000000000000 0000000000000000   ................
ffff000180e0b4c0:  1234567800000000 0000000000000000   ....xV4.........
ffff000180e0b4d0:  0000000000000000 0000000000000000   ................
ffff000180e0b4e0:  0000000000000000 0000000000000000   ................
ffff000180e0b4f0:  0000000000000000 0000000000000000   ................
ffff000180e0b500:  0000000000000000 0000000000000000   ................
ffff000180e0b510:  0000000000000000 0000000000000000   ................
ffff000180e0b520:  0000000000000000 00000000000080ac   ................
ffff000180e0b530:  000064b81c513364 0000000064b81c50   d3Q..d..P..d....
ffff000180e0b540:  00000000000e0011 0006000001010100   ................
ffff000180e0b550:  0000000000010001 0000000000000002   ................
ffff000180e0b560:  000080ad000080ad 0000000064b81c50   ........P..d....
ffff000180e0b570:  0000001c0000001e 0000000000000001   ................
ffff000180e0b580:  0000000064b81c50 0000001b0000001e   P..d............
ffff000180e0b590:  0000000000000001 0000000000000000   ................
ffff000180e0b5a0:  0000000000000000 0000000000000000   ................
ffff000180e0b5b0:  0000000000000001 ffff00017d1ec4c0   ...........}....
ffff000180e0b5c0:  0000000000000000 0000000000000000   ................
ffff000180e0b5d0:  0000000000000000 0000000000000000   ................
ffff000180e0b5e0:  0000000000000000 0000000000000000   ................
ffff000180e0b5f0:  ffff000180e0b5f0 ffff000180e0b5f0   ................

2.4 找到出错地址的保存位置

查看 0xffff000180e0b3c0 前后的内存信息，可以看到其中包含用户设置的配置信息，这些信息都保存在 struct cimdma_subdev 中，而 cimdma_subdev 是 struct j5_cimdma_dev 的成员变量。

struct j5_cimdma_dev {
    /* j5 cimdma information */
    struct platform_device *pdev;
    void __iomem *base_reg;
    resource_size_t regs_start;
    resource_size_t regs_end;
    s32 irq;
    unsigned long state;
    struct class *class;
    struct cdev cdev;
    dev_t devno;
    ...
    struct cimdma_subdev subdev[VIO_MAX_STREAM];
    struct vio_group *group[VIO_MAX_STREAM];
    struct vio_group_task gtask[VIO_MAX_STREAM];
    ...
    }
 //通过静态变量g_cimdma找到struct j5_cimdma_dev指针
 crash> g_cimdma
g_cimdma = $2 = (struct j5_cimdma_dev *) 0xffff000180e08080

crash> struct j5_cimdma_dev 0xffff000180e08080 -o
struct j5_cimdma_dev {
  [ffff000180e08080] struct platform_device *pdev;
  [ffff000180e08088] void *base_reg;
  [ffff000180e08090] resource_size_t regs_start;
  [ffff000180e08098] resource_size_t regs_end;
  [ffff000180e080a0] s32 irq;
  [ffff000180e080a8] unsigned long state;
  [ffff000180e080b0] struct class *class;
  [ffff000180e080b8] struct cdev cdev;
  [ffff000180e08120] dev_t devno;
  [ffff000180e08124] atomic_t instance;
  [ffff000180e08128] atomic_t rsccount;
  [ffff000180e0812c] atomic_t open_cnt;
  [ffff000180e08130] u32 sw_drop_count[16];
  [ffff000180e08170] u32 hw_drop_count[16];
  [ffff000180e081b0] raw_spinlock_t raw_slock;
  [ffff000180e081b8] struct mutex mlock;
  [ffff000180e081e0] atomic_t sensor_fcount[8];
  [ffff000180e08200] atomic_t backup_fcount[8];
  [ffff000180e08220] atomic_t enable_cnt[8];
  [ffff000180e08240] u32 cur_output_flag[8];
  [ffff000180e08260] struct cimdma_subdev subdev[16];
  [ffff000180e0dfe0] struct vio_group *group[16];
  [ffff000180e0e060] struct vio_group_task gtask[16];
  [ffff000180e0e760] u32 fusa_enable;
  [ffff000180e0e768] u64 jiffi;
  [ffff000180e0e770] struct vio_stl stl;
  [ffff000180e0e7a8] u32 last_frameid[8];
  [ffff000180e0e7c8] u32 error_cnt[8];
}
SIZE: 26472

由于通过之前的 log 已知出错的通路是 pipeline 8，对应的结构体是 subdev[8]，下一步查看 subdev[8] 的内存信息（从 subdev 数组起始地址开始打印 9 个元素，最后一个即 subdev[8]）。

crash> struct cimdma_subdev ffff000180e08260 -o 9
....
struct cimdma_subdev {
  [ffff000180e0b120] struct vio_subdev vdev;
  [ffff000180e0b300] struct j5_cimdma_dev *cimdma;
  [ffff000180e0b308] wait_queue_head_t done_wq;
  [ffff000180e0b348] struct vio_framemgr emb_fmgr;
  [ffff000180e0b400] cim_dma_cfg_t cim_cfg;
  [ffff000180e0b4c8] struct frame_info preint_info;
  [ffff000180e0b548] u8 initial_frameid;
  [ffff000180e0b549] u8 yuv_format;
  [ffff000180e0b54a] u8 embeded_data;
  [ffff000180e0b54b] u8 embeded_dependence;
  [ffff000180e0b54c] u8 embeded_start_cnt;
  [ffff000180e0b54d] u8 pack_mode;
  [ffff000180e0b54e] u8 ipi_index;
  [ffff000180e0b54f] u8 tpg_en;
  [ffff000180e0b550] u8 reqbuf_flag;
  [ffff000180e0b551] u8 stop_flag;
  [ffff000180e0b552] u8 start_flag;
  [ffff000180e0b554] u32 cnt_shift;
  [ffff000180e0b558] u32 irq_status;
  [ffff000180e0b55c] u32 force_drop;
  [ffff000180e0b560] u32 sw_frameid;
  [ffff000180e0b564] u32 last_hw_frameid;
  [ffff000180e0b568] struct fps_debug fps[2];
  [ffff000180e0b598] fps_ctrl_t fps_ctrl;
  [ffff000180e0b5b0] u32 thread_run;
  [ffff000180e0b5b8] struct task_struct *cimdma_thread;
  [ffff000180e0b5c0] wait_queue_head_t cimdma_done_wq;
  [ffff000180e0b600] struct completion stop_complete;
  [ffff000180e0b620] struct vio_drop_mgr drop_mgr;
}
SIZE: 1496

X23：ffff000180e0b3c0 是在[ffff000180e0b348]struct vio_framemgr emb_fmgr 内，下一步查看 emb_fmgr 结构体信息。


crash> struct vio_framemgr ffff000180e0b348 -o
struct vio_framemgr {
  [ffff000180e0b348] u32 id;
  [ffff000180e0b34c] raw_spinlock_t raw_slock;
  [ffff000180e0b350] spinlock_t slock;
  [ffff000180e0b380] ulong sindex;
  [ffff000180e0b388] u32 num_frames;
  [ffff000180e0b38c] u32 num_buffers;
  [ffff000180e0b390] struct vio_frame *frames;
  [ffff000180e0b398] u32 queued_count[5];
  [ffff000180e0b3b0] struct list_head queued_list[5];
}
SIZE: 184

crash> list ffff000180e0b3f0
ffff000180e0b3f0
crash> struct list_head ffff000180e0b3b0 -o 5
struct list_head {
  [ffff000180e0b3b0] struct list_head *next;
  [ffff000180e0b3b8] struct list_head *prev;
}
SIZE: 16

struct list_head {
  [ffff000180e0b3c0] struct list_head *next;
  [ffff000180e0b3c8] struct list_head *prev;
}
SIZE: 16

struct list_head {
  [ffff000180e0b3d0] struct list_head *next;
  [ffff000180e0b3d8] struct list_head *prev;
}
SIZE: 16

struct list_head {
  [ffff000180e0b3e0] struct list_head *next;
  [ffff000180e0b3e8] struct list_head *prev;
}
SIZE: 16

struct list_head {
  [ffff000180e0b3f0] struct list_head *next;
  [ffff000180e0b3f8] struct list_head *prev;
}
SIZE: 16

2.5 定位原因

由此可知 X23（ffff000180e0b3c0）是 queued_list[1] 的起始地址。queued_list 是 5 个队列的 list head 数组，queued_list[1] 是 FS_REQUEST 队列；而代码中 emb_frame 正是从 FS_REQUEST 队列中获取的，也就是说 peek_frame 拿到的不是队列中的某个 frame，而是 FS_REQUEST 队列的 list head 本身。


static void cimdma_separate_embedded_data(struct cimdma_subdev *subdev)
{
....
    emb_fmgr = &subdev->emb_fmgr;
    vio_e_barrier_irqs(emb_fmgr, flags);/*PRQA S 2996*/
    emb_frame = peek_frame(emb_fmgr, FS_REQUEST);
    vio_x_barrier_irqr(emb_fmgr, flags);/*PRQA S 2996*/
    if (emb_frame == NULL) {
        vio_err("[S%d] emb FS_REQUEST queue has no member;\n", group->instance);
        framemgr_print_queues(emb_fmgr);
        return;
    }

    emb_frame->frameinfo.frame_id = frame->frameinfo.frame_id;
    emb_frame->frameinfo.timestamps = frame->frameinfo.timestamps;
    emb_frame->frameinfo.tv_sec = frame->frameinfo.tv_sec;
    emb_frame->frameinfo.tv_usec = frame->frameinfo.tv_usec;
    vio_frame_sync_for_cpu(frame);
    for (i = 0; i < subdev->cim_cfg.embeded_width; i++) {
        if (i % 2 == 0)
            emb_frame->buffer.info.addr[0][i] = frame->buffer.info.addr[1][i/2];
        else
            emb_frame->buffer.info.addr[0][i] = frame->buffer.info.addr[0][i/2];
    }
      ...
   }

通过以上分析可知，emb_fmgr 的链表操作存在问题，推测是锁保护出了问题。重新 review 代码后发现确实如此：对 emb_fmgr 的链表执行 trans_frame 操作时，加的却是 framemgr 的 spinlock，而不是 emb_fmgr 自己的锁。

   static void cimdma_separate_embedded_data(struct cimdma_subdev *subdev)
   {
    ...
    vio_e_barrier_irqs(framemgr, flags);/*PRQA S 2996*/
    trans_frame(emb_fmgr, emb_frame, FS_COMPLETE);
    vio_x_barrier_irqr(framemgr, flags);/*PRQA S 2996*/
    wake_up(&subdev->done_wq);
    }

修改此处的锁使用错误，便能从根本上修复该问题。

03 结论与反思

用锁保护临界资源是解决多进程并发问题的常用手段，但是锁保护的范围是否正确，一直没有有效的手段进行检查；在后续的项目或者芯片平台上，使用锁保护时需要增加注释，说明该锁保护的是哪些资源，方便自己和其他同学检查，减少出错概率。

*博客内容为网友个人发布，仅代表博主个人观点，如有侵权请联系工作人员删除。