1.总体流程
sb的更新会先计算出events的值后(++或--),更新需要load的硬盘的sb属性(sb_loaded标志),之后统一提交bio到硬盘。
值得一说的是,events计数并不一定是递增的,也可以回退。
2.events计算1 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2 force_change = 1; 3 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 4 /* just a clean<-> dirty transition, possibly leave spares alone, 5 * though if events isn't the right even/odd, we will have to do 6 * spares after all 7 */ 8 nospares = 1; 9 if (force_change) 10 nospares = 0; 11 if (mddev->degraded) 12 /* If the array is degraded, then skipping spares is both 13 * dangerous and fairly pointless. 14 * Dangerous because a device that was removed from the array 15 * might have a event_count that still looks up-to-date, 16 * so it can be re-added without a resync. 17 * Pointless because if there are any spares to skip, 18 * then a recovery will happen and soon that array won't 19 * be degraded any more and the spare can go back to sleep then. 20 */ 21 nospares = 0; 22 23 sync_req = mddev->in_sync; 24 25 /* If this is just a dirty<->clean transition, and the array is clean 26 * and 'events' is odd, we can roll back to the previous clean state */ 27 if (nospares 28 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 29 && mddev->can_decrease_events 30 && mddev->events != 1) { 31 mddev->events--; 32 mddev->can_decrease_events = 0; 33 } else { 34 /* otherwise we have to go forward and ... */ 35 mddev->events ++; 36 mddev->can_decrease_events = nospares; 37 } 38 39 /* 40 * This 64-bit counter should never wrap. 41 * Either we are in around ~1 trillion A.C., assuming 42 * 1 reboot per second, or we have a bug... 43 */ 44 WARN_ON(mddev->events == 0);
3.sb_loaded标志
1 static void sync_sbs(struct mddev *mddev, int nospares) 2 { 3 /* Update each superblock (in-memory image), but 4 * if we are allowed to, skip spares which already 5 * have the right event counter, or have one earlier 6 * (which would mean they aren't being marked as dirty 7 * with the rest of the array) 8 * 9 * 更新每个超级块(内存映像中), 10 * 但如果允许的话,跳过已经有正确事件计数器的备用块, 11 * 或者更早有一个(这意味着它们不会被数组的其他部分标记为脏的) 12 */ 13 struct md_rdev *rdev; 14 char b[BDEVNAME_SIZE]; 15 /* 16 * events相等时不会去更新 17 * events不等的spare盘落后恰好1时也不会去更新 18 */ 19 rdev_for_each(rdev, mddev) { 20 if (rdev->sb_events == mddev->events || 21 (nospares && 22 rdev->raid_disk < 0 && 23 rdev->sb_events+1 == mddev->events)) { 24 /* Don't update this superblock */ 25 rdev->sb_loaded = 2; 26 } else { 27 sync_super(mddev, rdev); 28 rdev->sb_loaded = 1; 29 } 33 } 34 }
按照上述的逻辑实现,热备盘允许滞后一次计数(即sb_events恰好比mddev->events小1),但是成员盘必须全部更新。据此设置sb_loaded标志。
sync_super为赋值超块属性的函数,这里不做赘述。
4.更新超块
轮询same_set的所有硬盘(成员盘+热备盘),通过md_super_write下发bio到硬盘。
1 void md_update_sb(struct mddev *mddev, int force_change) 2 { 3 ... 4 5 rdev_for_each(rdev, mddev) { 6 char b[BDEVNAME_SIZE]; 7 /* 判断sb_loaded值,当不为1时跳过不更新sb信息 8 该值在sync_sbs中已被设置 */ 9 if (rdev->sb_loaded != 1){ 10 pr_warn("md: md_update_sb disk:%s sb_loaded:%d!\n", 11 bdevname(rdev->bdev,b),rdev->sb_loaded); 12 continue; /* no noise on spare devices */ 13 } 14 /* 开始更新,已知坏盘不更新sb */ 15 if (!test_bit(Faulty, &rdev->flags)) { 16 md_super_write(mddev,rdev, 17 rdev->sb_start, rdev->sb_size, 18 rdev->sb_page); 19 pr_debug("md: (write) %s's sb offset: %llu\n", 20 bdevname(rdev->bdev, b), 21 (unsigned long long)rdev->sb_start); 22 rdev->sb_events = mddev->events; 23 if (rdev->badblocks.size) { 24 md_super_write(mddev, rdev, 25 rdev->badblocks.sector, 26 rdev->badblocks.size << 9, 27 rdev->bb_page); 28 rdev->badblocks.size = 0; 29 } 30 31 } else 32 pr_warn("md: md_update_sb disk:%s (skipping faulty)\n", 33 bdevname(rdev->bdev, b)); 34 } 35 } 36 /* 等待pending值为0,即为所有盘更新超块结束 */ 37 if (md_super_wait(mddev) < 0) 38 goto rewrite; 39 ... 40 }
下发bio:
1 void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 2 sector_t sector, int size, struct page *page) 3 { 4 /* write first size bytes of page to sector of rdev 5 * Increment mddev->pending_writes before returning 6 * and decrement it on completion, waking up sb_wait 7 * if zero is reached. 8 * If an error occurred, call md_error 9 */ 10 struct bio *bio; 11 int ff = WRITE_FLUSH_FUA; 12 13 if (!page) 14 return; 15 16 if (test_bit(Faulty, &rdev->flags)) 17 return; 18 19 bio = md_bio_alloc_sync(mddev); 20 21 atomic_inc(&rdev->nr_pending); 22 23 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; 24 bio->bi_sector = sector; 25 bio_add_page(bio, page, size, 0); 26 bio->bi_private = rdev; 27 bio->bi_end_io = super_written;/* 结束回调函数 */ 28 29 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 30 test_bit(FailFast, &rdev->flags) && 31 !test_bit(LastDev, &rdev->flags)) 32 ff |= MD_FAILFAST; 33 /* 更新pending值 ++ */ 34 atomic_inc(&mddev->pending_writes); 35 submit_bio(ff, bio); 36 }
super_written中会调用pers->error_handler(不同raid模式的错误处理回调),下面以raid1(raid1_error)为例:
1 static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) 2 { 3 char b[BDEVNAME_SIZE]; 4 struct r1conf *conf = mddev->private; 5 unsigned long flags; 6 7 spin_lock_irqsave(&conf->device_lock, flags); 8 if (test_bit(In_sync, &rdev->flags) 9 && (conf->raid_disks - mddev->degraded) == 1) { 10 conf->recovery_disabled = mddev->recovery_disabled; 11 spin_unlock_irqrestore(&conf->device_lock, flags); 12 return; 13 } 14 set_bit(Blocked, &rdev->flags); 15 if (test_and_clear_bit(In_sync, &rdev->flags)) { 16 mddev->degraded++; 17 set_bit(Faulty, &rdev->flags); 18 } else 19 set_bit(Faulty, &rdev->flags); 20 spin_unlock_irqrestore(&conf->device_lock, flags); 21 /* 22 * if recovery is running, make sure it aborts. 23 */ 24 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 25 /* 26 * 这里设置了MD_SB_CHANGE_DEVS MD_SB_CHANGE_PENDING两个标, 27 * 会在结束后重新写入超块 28 * 29 * set_mask_bits代表: 30 * 先清除第二参数,然后设置第三参数,如果第二参数为0则不会清除任何,返回值为集合完成后的值 31 */ 32 set_mask_bits(&mddev->sb_flags, 0, 33 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 34 }
当有硬盘发生io错误后,会将盘状态置为faulty,并且设置MD_SB_CHANGE_DEVS和MD_SB_CHANGE_PENDING两个标志,重新增加一次events计数(超块更新失败本身也算一次事件,所以要再次更新events)。
1 void md_update_sb(struct mddev *mddev, int force_change) 2 { 3 4 ... 5 6 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 7 /* 超块写失败会走到这里重新尝试写,且events会再次+1 */ 8 /* 9 * bit_clear_unless表示: 10 * 如果MD_SB_CHANGE_DEVS和MD_SB_CHANGE_CLEAN任何一个被置位,那么就不清除MD_SB_CHANGE_PENDING, 11 * 如果两者均未置位,则MD_SB_CHANGE_PENDING被清除, 12 * 返回值:清除成功(即条件成立)返回真,否则返回0 13 */ 14 if (mddev->in_sync != sync_req || 15 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 16 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 17 /* have to write it out again 再更新一次sb*/ 18 goto repeat; 19 wake_up(&mddev->sb_wait); 20 21 ... 22 23 }
5.总结
以上为events事件计数更新机制,从调用上看,事件计数不会频繁更新,当发生io错误,阵列变化时,会通过修改events值用于版本记录。当raid重新激活时,能够依据events值判断当前硬盘超块信息的可信度。
从逻辑上看,event值一定是存在以下几点:
- 正常硬盘间的events值相差不会大于1(成员盘全部更新,热备盘最多允许落后1个计数);
- 热备盘的events一定不会比成员盘的数值大;
raid激活时,允许events相差小于2的硬盘重回阵列,否则认为该硬盘数据不可靠,禁止使用。
目前遇到一个raid1激活失败的问题:所有成员盘的events落后热备盘的events 2个版本值,所以在重新激活时,成员盘全部被踢出阵列,只留下热备盘,导致raid1激活失败(热备盘的数据并不能单独激活)。从日志中均未找到任何sb更新的io报错。而最近的一次问题复现中,与硬盘固件团队沟通得知,他们在自测过程中存在驱动问题导致所有盘均未上线,但当时raid1是被激活的,且后续该同事手动上线了所有硬盘。沟通后确认存在bio下发后未落盘也未报错的问题,这种情况有可能导致有些盘未更新、有些盘已更新,但暂未复现该问题。
/*标签:md,MD,rdev,mddev,sb,events From: https://www.cnblogs.com/wdwlwzz/p/17284121.html