md event counting (sb_events)

Published 2023-04-04 10:33:02  Author: 只能爱你的疯子

1. Overall flow

 

A superblock update first computes the new events value (incrementing or decrementing it), then marks which disks' superblocks need to be written (the sb_loaded flag), and finally submits the bios to the disks in one batch.

It is worth noting that the events count is not strictly increasing; it can also be rolled back.
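
As an illustration, here is a minimal user-space sketch of that increment/decrement decision with hypothetical values (the recovery_cp check from the real code in section 2 is omitted):

/* User-space model of the events decision (not kernel code); values are
 * hypothetical. A clean->dirty transition arms a rollback, and a plain
 * dirty->clean transition can then decrement instead of increment. */
#include <stdio.h>

struct fake_mddev { unsigned long long events; int can_decrease_events; };

static void update_events(struct fake_mddev *m, int nospares, int in_sync)
{
    if (nospares && in_sync && m->can_decrease_events && m->events != 1) {
        m->events--;                 /* roll back to the previous clean value */
        m->can_decrease_events = 0;
    } else {
        m->events++;                 /* otherwise go forward */
        m->can_decrease_events = nospares;
    }
}

int main(void)
{
    struct fake_mddev m = { .events = 4, .can_decrease_events = 0 };

    update_events(&m, 1, 0);         /* clean -> dirty: 4 -> 5, rollback armed */
    printf("after dirty transition: events=%llu\n", m.events);
    update_events(&m, 1, 1);         /* dirty -> clean, nothing else changed: 5 -> 4 */
    printf("after clean transition: events=%llu\n", m.events);
    return 0;
}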

2. Computing events
    if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
        force_change = 1;
    if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
        /* just a clean<-> dirty transition, possibly leave spares alone,
         * though if events isn't the right even/odd, we will have to do
         * spares after all
         */
        nospares = 1;
    if (force_change)
        nospares = 0;
    if (mddev->degraded)
        /* If the array is degraded, then skipping spares is both
         * dangerous and fairly pointless.
         * Dangerous because a device that was removed from the array
         * might have a event_count that still looks up-to-date,
         * so it can be re-added without a resync.
         * Pointless because if there are any spares to skip,
         * then a recovery will happen and soon that array won't
         * be degraded any more and the spare can go back to sleep then.
         */
        nospares = 0;

    sync_req = mddev->in_sync;

    /* If this is just a dirty<->clean transition, and the array is clean
     * and 'events' is odd, we can roll back to the previous clean state */
    if (nospares
        && (mddev->in_sync && mddev->recovery_cp == MaxSector)
        && mddev->can_decrease_events
        && mddev->events != 1) {
        mddev->events--;
        mddev->can_decrease_events = 0;
    } else {
        /* otherwise we have to go forward and ... */
        mddev->events ++;
        mddev->can_decrease_events = nospares;
    }

    /*
     * This 64-bit counter should never wrap.
     * Either we are in around ~1 trillion A.C., assuming
     * 1 reboot per second, or we have a bug...
     */
    WARN_ON(mddev->events == 0);

3. The sb_loaded flag

static void sync_sbs(struct mddev *mddev, int nospares)
{
    /* Update each superblock (in-memory image), but
     * if we are allowed to, skip spares which already
     * have the right event counter, or have one earlier
     * (which would mean they aren't being marked as dirty
     * with the rest of the array)
     */
    struct md_rdev *rdev;
    char b[BDEVNAME_SIZE];
    /*
     * A disk whose events already equals mddev->events is not rewritten;
     * a spare whose events lags by exactly 1 is also skipped.
     */
    rdev_for_each(rdev, mddev) {
        if (rdev->sb_events == mddev->events ||
            (nospares &&
             rdev->raid_disk < 0 &&
             rdev->sb_events+1 == mddev->events)) {
            /* Don't update this superblock */
            rdev->sb_loaded = 2;
        } else {
            sync_super(mddev, rdev);
            rdev->sb_loaded = 1;
        }
    }
}

Per the logic above, a spare disk is allowed to lag behind by one count (a difference of at most 1), while every member disk must be updated; sb_loaded is set accordingly.

sync_super is the function that fills in the superblock fields; it is not covered here.
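
As a concrete illustration of the skip rule, here is a small user-space sketch (hypothetical events values, assuming nospares = 1):

/* Model of the sync_sbs skip rule above; not kernel code, values are made up. */
#include <stdio.h>

/* Returns 1 when the superblock write can be skipped (sb_loaded = 2). */
static int skip_sb_write(unsigned long long mddev_events, int nospares,
                         int raid_disk, unsigned long long sb_events)
{
    return sb_events == mddev_events ||
           (nospares && raid_disk < 0 && sb_events + 1 == mddev_events);
}

int main(void)
{
    /* mddev->events = 10, nospares = 1 */
    printf("member at 10: skip=%d\n", skip_sb_write(10, 1, 0, 10));  /* 1: up to date */
    printf("member at 9 : skip=%d\n", skip_sb_write(10, 1, 0, 9));   /* 0: must rewrite */
    printf("spare  at 9 : skip=%d\n", skip_sb_write(10, 1, -1, 9));  /* 1: may lag by one */
    printf("spare  at 8 : skip=%d\n", skip_sb_write(10, 1, -1, 8));  /* 0: must rewrite */
    return 0;
}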

4. Updating the superblock

Iterate over all disks in the same_set (member disks plus spares) and submit a bio to each one via md_super_write.

void md_update_sb(struct mddev *mddev, int force_change)
{
  ...

  rdev_for_each(rdev, mddev) {
        char b[BDEVNAME_SIZE];
        /* Check sb_loaded (set in sync_sbs above); when it is not 1,
           skip this disk and leave its superblock untouched */
        if (rdev->sb_loaded != 1){
            pr_warn("md: md_update_sb disk:%s sb_loaded:%d!\n",
                    bdevname(rdev->bdev,b),rdev->sb_loaded);
            continue; /* no noise on spare devices */
        }
        /* Do the update; disks already known to be Faulty are skipped */
        if (!test_bit(Faulty, &rdev->flags)) {
            md_super_write(mddev,rdev,
                       rdev->sb_start, rdev->sb_size,
                       rdev->sb_page);
            pr_debug("md: (write) %s's sb offset: %llu\n",
                 bdevname(rdev->bdev, b),
                 (unsigned long long)rdev->sb_start);
            rdev->sb_events = mddev->events;
            if (rdev->badblocks.size) {
                md_super_write(mddev, rdev,
                           rdev->badblocks.sector,
                           rdev->badblocks.size << 9,
                           rdev->bb_page);
                rdev->badblocks.size = 0;
            }

        } else
            pr_warn("md: md_update_sb disk:%s (skipping faulty)\n",
                 bdevname(rdev->bdev, b));
    }
    /* Wait for pending_writes to drop to 0, i.e. every disk has
       finished its superblock write */
    if (md_super_wait(mddev) < 0)
        goto rewrite;
  ...
}
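
For reference, md_super_wait is roughly the following in mainline kernels (a sketch from memory; names may differ in the kernel version this article is based on). It sleeps on sb_wait until pending_writes reaches 0, and a negative return is what makes the goto rewrite above fire:

/* Approximate shape of md_super_wait(); a sketch, not verbatim source. */
int md_super_wait(struct mddev *mddev)
{
    /* wait for all superblock writes that were scheduled to complete */
    wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes) == 0);
    if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
        return -EAGAIN;    /* a failfast sb write failed: caller must rewrite */
    return 0;
}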

Submitting the bio:

void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
           sector_t sector, int size, struct page *page)
{
    /* write first size bytes of page to sector of rdev
     * Increment mddev->pending_writes before returning
     * and decrement it on completion, waking up sb_wait
     * if zero is reached.
     * If an error occurred, call md_error
     */
    struct bio *bio;
    int ff = WRITE_FLUSH_FUA;

    if (!page)
        return;

    if (test_bit(Faulty, &rdev->flags))
        return;

    bio = md_bio_alloc_sync(mddev);

    atomic_inc(&rdev->nr_pending);

    bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
    bio->bi_sector = sector;
    bio_add_page(bio, page, size, 0);
    bio->bi_private = rdev;
    bio->bi_end_io = super_written;    /* completion callback */

    if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
        test_bit(FailFast, &rdev->flags) &&
        !test_bit(LastDev, &rdev->flags))
        ff |= MD_FAILFAST;
    /* bump the pending write count */
    atomic_inc(&mddev->pending_writes);
    submit_bio(ff, bio);
}
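
The completion callback registered above, super_written, is not quoted in full here. Per the comment at the top of md_super_write it must decrement pending_writes, wake sb_wait when it reaches zero, and call md_error on failure. A rough sketch (not the verbatim kernel source; the error field and the FailFast handling differ across kernel versions):

/* Rough sketch of super_written() based on the md_super_write comment above. */
static void super_written(struct bio *bio)
{
    struct md_rdev *rdev = bio->bi_private;
    struct mddev *mddev = rdev->mddev;

    if (bio->bi_status)
        md_error(mddev, rdev);                 /* write failed: fail the device */

    if (atomic_dec_and_test(&mddev->pending_writes))
        wake_up(&mddev->sb_wait);              /* last sb write done: wake md_super_wait */
    rdev_dec_pending(rdev, mddev);
    bio_put(bio);
}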

 

On a write error, super_written calls md_error, which invokes pers->error_handler (the error-handling callback of each RAID personality). raid1's handler, raid1_error, is used as the example below:

static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
{
    char b[BDEVNAME_SIZE];
    struct r1conf *conf = mddev->private;
    unsigned long flags;

    spin_lock_irqsave(&conf->device_lock, flags);
    if (test_bit(In_sync, &rdev->flags)
        && (conf->raid_disks - mddev->degraded) == 1) {
        conf->recovery_disabled = mddev->recovery_disabled;
        spin_unlock_irqrestore(&conf->device_lock, flags);
        return;
    }
    set_bit(Blocked, &rdev->flags);
    if (test_and_clear_bit(In_sync, &rdev->flags)) {
        mddev->degraded++;
        set_bit(Faulty, &rdev->flags);
    } else
        set_bit(Faulty, &rdev->flags);
    spin_unlock_irqrestore(&conf->device_lock, flags);
    /*
     * if recovery is running, make sure it aborts.
     */
    set_bit(MD_RECOVERY_INTR, &mddev->recovery);
    /*
     * MD_SB_CHANGE_DEVS and MD_SB_CHANGE_PENDING are both set here,
     * so the superblock will be written out again afterwards.
     *
     * set_mask_bits: first clear the bits in the second argument, then
     * set the bits in the third; a mask of 0 clears nothing. It returns
     * the resulting flag value.
     */
    set_mask_bits(&mddev->sb_flags, 0,
              BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
}

When a disk hits an I/O error, it is marked Faulty and both MD_SB_CHANGE_DEVS and MD_SB_CHANGE_PENDING are set, so the events count is bumped once more (a failed superblock write is itself an event, so events must be updated again).

void md_update_sb(struct mddev *mddev, int force_change)
{

  ...

  /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
    /* A failed superblock write lands here and retries the write,
       and events is incremented again on the next pass */
    /*
     * bit_clear_unless: clear MD_SB_CHANGE_PENDING unless MD_SB_CHANGE_DEVS
     * or MD_SB_CHANGE_CLEAN is set; it returns true only if the clear was
     * actually performed (neither bit was set), otherwise false.
     */
    if (mddev->in_sync != sync_req ||
        !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
                   BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
        /* have to write it out again */
        goto repeat;
    wake_up(&mddev->sb_wait);

  ...

}
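
The retry condition can be illustrated with a small user-space model of bit_clear_unless (simplified, not atomic like the kernel macro):

/* User-space model of bit_clear_unless() semantics as described above:
 * clear the `clear` bits in *flags unless any `unless` bit is set;
 * return non-zero iff the clear happened. */
#include <stdio.h>

#define BIT(n) (1UL << (n))
enum { MD_SB_CHANGE_DEVS, MD_SB_CHANGE_CLEAN, MD_SB_CHANGE_PENDING };

static int bit_clear_unless_model(unsigned long *flags,
                                  unsigned long clear, unsigned long unless)
{
    if (*flags & unless)
        return 0;           /* a new change arrived: leave PENDING set, caller repeats */
    *flags &= ~clear;
    return 1;               /* cleared: this superblock write cycle is complete */
}

int main(void)
{
    unsigned long f = BIT(MD_SB_CHANGE_PENDING);
    printf("no new change : cleared=%d flags=%lx\n",
           bit_clear_unless_model(&f, BIT(MD_SB_CHANGE_PENDING),
                                  BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)), f);

    f = BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS); /* a write failed meanwhile */
    printf("DEVS set again: cleared=%d flags=%lx\n",
           bit_clear_unless_model(&f, BIT(MD_SB_CHANGE_PENDING),
                                  BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)), f);
    return 0;
}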

 

5. Summary

The above is the update mechanism for the events counter. Judging from its call sites, the count is not updated frequently; it is changed to record a new version whenever an I/O error occurs or the array configuration changes. When the RAID is reassembled, the events value is used to judge how trustworthy each disk's superblock is.

From the logic above, the events values should always satisfy the following:

  1. Healthy disks' events values never differ by more than 1 (member disks are always all updated, while a spare is allowed to lag by one).
  2. A spare's events value is never larger than a member disk's.

When the array is assembled, a disk whose events value differs from the freshest superblock by less than 2 is allowed back into the array; otherwise its data is considered unreliable and the disk is not used.
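
Expressed as code, the acceptance rule amounts to something like the following sketch (a hypothetical helper, not the kernel's actual super_*_validate code; the numbers mirror the failure described next):

/* Sketch of the assembly-time acceptance rule: a disk whose events lags the
 * freshest superblock by 2 or more is rejected. */
#include <stdio.h>

static int rdev_acceptable(unsigned long long freshest_events,
                           unsigned long long rdev_events)
{
    return rdev_events + 1 >= freshest_events;   /* a lag of 0 or 1 is allowed */
}

int main(void)
{
    /* Failure case from the report below: members at 100, spare at 102. */
    unsigned long long spare = 102, member = 100;
    printf("member accepted? %d\n", rdev_acceptable(spare, member)); /* 0: kicked out */
    printf("spare  accepted? %d\n", rdev_acceptable(spare, spare));  /* 1 */
    return 0;
}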

We are currently investigating a raid1 assembly failure: every member disk's events lags the spare's events by 2, so on reassembly all member disks are kicked out of the array and only the spare is left, and raid1 assembly fails (the spare's data alone cannot start the array). No I/O errors from superblock updates were found anywhere in the logs. In the most recent occurrence, the disk firmware team reported that during their own testing a driver problem had kept all disks from coming online, yet raid1 was assembled at that time, and the colleague later brought all the disks online manually. After further discussion they acknowledged an issue where a submitted bio neither reaches the media nor reports an error; that could leave some disks updated and others not, but the problem has not been reproduced so far.

 

 
